tcp: fast retransmit improvements

Patch is too large to be ported to 18.10 just days before release.

- handle fast retransmits outside of the established node and limit the
retransmit burst size to avoid tx losses and worsening congestion.
- in the absence of a tx pacer, use slow start after fast retransmit
exits
- add fast retransmit heuristic that retries resending the first
segment if everything else fails
- fine tuning

Change-Id: I84a2ab8fbba8b97f1d2b26584dc11a1e2c33c8d2
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index e32b5c4..cb05b8c 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -950,7 +950,8 @@
 
   hole = scoreboard_first_hole (sb);
   if (hole)
-    s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail);
+    s = format (s, "\n head %u tail %u %u holes:\n", sb->head, sb->tail,
+		pool_elts (sb->holes));
 
   while (hole)
     {
@@ -1027,7 +1028,7 @@
 {
   int snd_space, snt_limited;
 
-  if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0))
+  if (PREDICT_TRUE (!tcp_in_fastrecovery (tc)))
     {
       snd_space = tcp_available_output_snd_space (tc);
 
@@ -1047,16 +1048,6 @@
       return tcp_round_snd_space (tc, snd_space);
     }
 
-  if (tcp_in_recovery (tc))
-    {
-      tc->snd_nxt = tc->snd_una_max;
-      snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes
-	- (tc->snd_una_max - tc->snd_congestion);
-      if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd)
-	return 0;
-      return tcp_round_snd_space (tc, snd_space);
-    }
-
   /* RFC 5681: When previously unsent data is available and the new value of
    * cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS
    * bytes of previously unsent data. */
@@ -1103,6 +1094,7 @@
   tw_timer_expire_timers_16t_2w_512sl (&tcp_main.
 				       wrk_ctx[thread_index].timer_wheel,
 				       now);
+  tcp_do_fastretransmits (thread_index);
   tcp_flush_frames_to_output (thread_index);
 }
 
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 165659b..a036072 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -120,6 +120,8 @@
   _(FR_1_SMSS, "Sent 1 SMSS")			\
   _(HALF_OPEN_DONE, "Half-open completed")	\
   _(FINPNDG, "FIN pending")			\
+  _(FRXT_PENDING, "Fast-retransmit pending")	\
+  _(FRXT_FIRST, "Fast-retransmit first again")	\
 
 typedef enum _tcp_connection_flag_bits
 {
@@ -345,6 +347,9 @@
 #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
+#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST)
 
 #define tcp_in_cong_recovery(tc) ((tc)->flags & 		\
 	  (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
@@ -354,6 +359,7 @@
 {
   tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
   tcp_fastrecovery_1_smss_off (tc);
+  tcp_fastrecovery_first_off (tc);
 }
 
 typedef enum _tcp_error
@@ -379,9 +385,15 @@
 						     output nodes */
   vlib_frame_t *ip_lookup_tx_frames[2];		/**< tx frames for ip 4/6
 						     lookup nodes */
+  u32 *pending_fast_rxt;			/**< vector of connections
+						     needing fast rxt */
+  u32 *ongoing_fast_rxt;			/**< vector of connections
+						     now doing fast rxt */
+
     CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
   u8 cached_opts[40];				/**< cached 'on the wire'
 						     options for bursts */
+
 } tcp_worker_ctx_t;
 
 typedef struct _tcp_main
@@ -542,6 +554,8 @@
 void tcp_update_rto (tcp_connection_t * tc);
 void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4);
 void tcp_flush_frames_to_output (u8 thread_index);
+void tcp_program_fastretransmit (tcp_connection_t * tc);
+void tcp_do_fastretransmits (u32 thread_index);
 
 always_inline u32
 tcp_end_seq (tcp_header_t * th, u32 len)
@@ -659,10 +673,10 @@
 }
 
 u32 tcp_snd_space (tcp_connection_t * tc);
-void tcp_retransmit_first_unacked (tcp_connection_t * tc);
-void tcp_fast_retransmit_no_sack (tcp_connection_t * tc);
-void tcp_fast_retransmit_sack (tcp_connection_t * tc);
-void tcp_fast_retransmit (tcp_connection_t * tc);
+int tcp_retransmit_first_unacked (tcp_connection_t * tc);
+int tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size);
+int tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size);
+int tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size);
 void tcp_cc_init_congestion (tcp_connection_t * tc);
 void tcp_cc_fastrecovery_exit (tcp_connection_t * tc);
 
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index ccf12da..8f626b1 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -629,6 +629,8 @@
 
 #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)			\
 {									\
+  if (_tc->snd_una != _tc->iss)						\
+    TCP_EVT_CC_STAT_PRINT (_tc);					\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
     .format = "cc: %s snd_space %u snd_una %u out %u flight %u",	\
@@ -788,9 +790,11 @@
 
 #define STATS_INTERVAL 1
 
-#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...)				\
-{									\
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())		\
+#define tcp_cc_time_to_print_stats(_tc)					\
+  _tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now() 		\
+  || tcp_in_fastrecovery (_tc)						\
+
+#define TCP_EVT_CC_RTO_STAT_PRINT(_tc)					\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
@@ -801,29 +805,40 @@
   ed->data[0] = _tc->rto;						\
   ed->data[1] = _tc->srtt;						\
   ed->data[2] = _tc->rttvar;						\
+}
+
+#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...)				\
+{									\
+if (tcp_cc_time_to_print_stats (_tc))					\
+{									\
+  TCP_EVT_CC_RTO_STAT_PRINT (_tc);					\
 }									\
 }
-#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...)				\
-{									\
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())		\
+
+#define TCP_EVT_CC_SND_STAT_PRINT(_tc)					\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "snd_stat: dack %u sacked %u lost %u out %u rxt %u",	\
+    .format = "snd_stat: cc_space %u sacked %u lost %u out %u rxt %u",	\
     .format_args = "i4i4i4i4i4",					\
   };									\
   DECLARE_ETD(_tc, _e, 5);						\
-  ed->data[0] = _tc->rcv_dupacks;					\
+  ed->data[0] = tcp_available_cc_snd_space (_tc);			\
   ed->data[1] = _tc->sack_sb.sacked_bytes;				\
   ed->data[2] = _tc->sack_sb.lost_bytes;				\
   ed->data[3] = tcp_bytes_out (_tc);					\
   ed->data[3] = _tc->snd_rxt_bytes;					\
+}
+
+#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...)				\
+{									\
+if (tcp_cc_time_to_print_stats (_tc))					\
+{									\
+    TCP_EVT_CC_SND_STAT_PRINT(_tc);					\
 }									\
 }
 
-#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)				\
-{									\
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now())		\
+#define TCP_EVT_CC_STAT_PRINT(_tc)					\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
@@ -836,7 +851,15 @@
   ed->data[2] = tcp_snd_space (_tc);					\
   ed->data[3] = _tc->ssthresh;						\
   ed->data[4] = _tc->snd_wnd;						\
-  TCP_EVT_CC_RTO_STAT_HANDLER (_tc);					\
+  TCP_EVT_CC_RTO_STAT_PRINT (_tc);					\
+  TCP_EVT_CC_SND_STAT_PRINT (_tc);					\
+}
+
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...)				\
+{									\
+if (tcp_cc_time_to_print_stats (_tc))					\
+{									\
+  TCP_EVT_CC_STAT_PRINT (_tc);						\
   _tc->c_cc_stat_tstamp = tcp_time_now();				\
 }									\
 }
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 4e3987e..39a538b 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -749,7 +749,7 @@
       /* Rule (3): if hole not lost */
       else if (seq_lt (hole->start, sb->high_sacked))
 	{
-	  *snd_limited = 1;
+	  *snd_limited = 0;
 	  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
 	}
       /* Rule (4): if hole beyond high_sacked */
@@ -993,10 +993,10 @@
   sb->last_sacked_bytes = sb->sacked_bytes
     - (old_sacked_bytes - sb->last_bytes_delivered);
   ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
-  ASSERT (sb->sacked_bytes == 0
+  ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
 	  || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
   ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
-	  - seq_max (tc->snd_una, ack));
+	  - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
   ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
 	  || sb->holes[sb->head].start == ack + sb->snd_una_adv);
   TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
@@ -1052,6 +1052,9 @@
   tcp_fastrecovery_on (tc);
   tc->snd_congestion = tc->snd_una_max;
   tc->cwnd_acc_bytes = 0;
+  tc->snd_rxt_bytes = 0;
+  tc->prev_ssthresh = tc->ssthresh;
+  tc->prev_cwnd = tc->cwnd;
   tc->cc_algo->congestion (tc);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
 }
@@ -1074,8 +1077,14 @@
   tc->snd_rxt_bytes = 0;
   tc->rcv_dupacks = 0;
   tc->snd_nxt = tc->snd_una_max;
+  tc->snd_rxt_bytes = 0;
+
+  /* HACK: since we don't have an output pacer, force slow start */
+  tc->cwnd = 20 * tc->snd_mss;
+
   tcp_fastrecovery_off (tc);
   tcp_fastrecovery_1_smss_off (tc);
+  tcp_fastrecovery_first_off (tc);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
 }
 
@@ -1088,13 +1097,14 @@
   tc->rcv_dupacks = 0;
   if (tcp_in_recovery (tc))
     tcp_cc_recovery_exit (tc);
+  else if (tcp_in_fastrecovery (tc))
+    tcp_cc_fastrecovery_exit (tc);
   ASSERT (tc->rto_boff == 0);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
-  /* TODO extend for fastrecovery */
 }
 
-static u8
-tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+static inline u8
+tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
 {
   return (tcp_in_recovery (tc) && tc->rto_boff == 1
 	  && tc->snd_rxt_ts
@@ -1102,6 +1112,20 @@
 	  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
 }
 
+static inline u8
+tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc)
+{
+  return (tcp_in_fastrecovery (tc)
+	  && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss);
+}
+
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+  return (tcp_cc_is_spurious_timeout_rxt (tc)
+	  || tcp_cc_is_spurious_fast_rxt (tc));
+}
+
 static int
 tcp_cc_recover (tcp_connection_t * tc)
 {
@@ -1158,6 +1182,84 @@
 	  || tcp_should_fastrecover_sack (tc));
 }
 
+void
+tcp_program_fastretransmit (tcp_connection_t * tc)
+{
+  tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index];
+  if (!(tc->flags & TCP_CONN_FRXT_PENDING))
+    {
+      vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
+      tc->flags |= TCP_CONN_FRXT_PENDING;
+    }
+}
+
+void
+tcp_do_fastretransmits (u32 thread_index)
+{
+  tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
+  u32 max_burst_size, burst_size, n_segs = 0;
+  tcp_connection_t *tc;
+  int i;
+
+  if (vec_len (wrk->pending_fast_rxt) == 0)
+    return;
+
+  vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt);
+  vec_reset_length (wrk->pending_fast_rxt);
+
+  max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
+  max_burst_size = clib_max (max_burst_size, 1);
+
+  for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++)
+    {
+      tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index);
+      tc->flags &= ~TCP_CONN_FRXT_PENDING;
+
+      if (!tcp_in_fastrecovery (tc))
+	continue;
+
+      /* TODO tx pacer instead of this */
+      if (n_segs >= VLIB_FRAME_SIZE)
+	{
+	  tcp_program_fastretransmit (tc);
+	  continue;
+	}
+
+      burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+
+      if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss)
+	{
+	  /* The first segment MUST be retransmitted */
+	  if (tcp_retransmit_first_unacked (tc))
+	    {
+	      tcp_program_fastretransmit (tc);
+	      continue;
+	    }
+
+	  /* Post retransmit update cwnd to ssthresh and account for the
+	   * three segments that have left the network and should've been
+	   * buffered at the receiver XXX */
+	  tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
+
+	  /* If cwnd allows, send more data */
+	  if (tcp_opts_sack_permitted (&tc->rcv_opts))
+	    {
+	      scoreboard_init_high_rxt (&tc->sack_sb,
+					tc->snd_una + tc->snd_mss);
+	      tc->sack_sb.rescue_rxt = tc->snd_una - 1;
+	      n_segs += tcp_fast_retransmit_sack (tc, burst_size);
+	    }
+	  else
+	    {
+	      n_segs += tcp_fast_retransmit_no_sack (tc, burst_size);
+	    }
+	}
+      else
+	n_segs += tcp_fast_retransmit (tc, burst_size);
+    }
+  vec_reset_length (wrk->ongoing_fast_rxt);
+}
+
 /**
  * One function to rule them all ... and in the darkness bind them
  */
@@ -1170,7 +1272,7 @@
     {
       if (tc->bytes_acked)
 	goto partial_ack;
-      tcp_fast_retransmit (tc);
+      tcp_program_fastretransmit (tc);
       return;
     }
   /*
@@ -1196,20 +1298,10 @@
 	{
 	  ASSERT (!tcp_in_fastrecovery (tc));
 
-	  /* If of of the two conditions lower hold, reset dupacks because
-	   * we're probably after timeout (RFC6582 heuristics).
-	   * If Cumulative ack does not cover more than congestion threshold,
-	   * and:
-	   * 1) The following doesn't hold: The congestion window is greater
-	   *    than SMSS bytes and the difference between highest_ack
-	   *    and prev_highest_ack is at most 4*SMSS bytes
-	   * 2) Echoed timestamp in the last non-dup ack does not equal the
-	   *    stored timestamp
-	   */
-	  if (seq_leq (tc->snd_una, tc->snd_congestion)
-	      && ((!(tc->cwnd > tc->snd_mss
-		     && tc->bytes_acked <= 4 * tc->snd_mss))
-		  || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+	  /* Heuristic to catch potential late dupacks
+	   * after fast retransmit exits */
+	  if (is_dack && tc->snd_una == tc->snd_congestion
+	      && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack))
 	    {
 	      tc->rcv_dupacks = 0;
 	      return;
@@ -1218,26 +1310,10 @@
 	  tcp_cc_init_congestion (tc);
 	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
 
-	  /* The first segment MUST be retransmitted */
-	  tcp_retransmit_first_unacked (tc);
-
-	  /* Post retransmit update cwnd to ssthresh and account for the
-	   * three segments that have left the network and should've been
-	   * buffered at the receiver XXX */
-	  tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
-	  ASSERT (tc->cwnd >= tc->snd_mss);
-
-	  /* If cwnd allows, send more data */
 	  if (tcp_opts_sack_permitted (&tc->rcv_opts))
-	    {
-	      scoreboard_init_high_rxt (&tc->sack_sb,
-					tc->snd_una + tc->snd_mss);
-	      tcp_fast_retransmit_sack (tc);
-	    }
-	  else
-	    {
-	      tcp_fast_retransmit_no_sack (tc);
-	    }
+	    tc->sack_sb.high_rxt = tc->snd_una;
+
+	  tcp_program_fastretransmit (tc);
 	  return;
 	}
       else if (!tc->bytes_acked
@@ -1249,6 +1325,28 @@
       else
 	goto partial_ack;
     }
+  /* Don't allow entry in fast recovery if still in recovery, for now */
+  else if (0 && is_dack && tcp_in_recovery (tc))
+    {
+      /* If of of the two conditions lower hold, reset dupacks because
+       * we're probably after timeout (RFC6582 heuristics).
+       * If Cumulative ack does not cover more than congestion threshold,
+       * and:
+       * 1) The following doesn't hold: The congestion window is greater
+       *    than SMSS bytes and the difference between highest_ack
+       *    and prev_highest_ack is at most 4*SMSS bytes
+       * 2) Echoed timestamp in the last non-dup ack does not equal the
+       *    stored timestamp
+       */
+      if (seq_leq (tc->snd_una, tc->snd_congestion)
+	  && ((!(tc->cwnd > tc->snd_mss
+		 && tc->bytes_acked <= 4 * tc->snd_mss))
+	      || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+	{
+	  tc->rcv_dupacks = 0;
+	  return;
+	}
+    }
 
   if (!tc->bytes_acked)
     return;
@@ -1259,14 +1357,11 @@
   /*
    * Legitimate ACK. 1) See if we can exit recovery
    */
-  /* XXX limit this only to first partial ack? */
-  if (seq_lt (tc->snd_una, tc->snd_congestion))
-    tcp_retransmit_timer_force_update (tc);
-  else
-    tcp_retransmit_timer_update (tc);
 
   if (seq_geq (tc->snd_una, tc->snd_congestion))
     {
+      tcp_retransmit_timer_update (tc);
+
       /* If spurious return, we've already updated everything */
       if (tcp_cc_recover (tc))
 	{
@@ -1286,6 +1381,9 @@
    * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
    */
 
+  /* XXX limit this only to first partial ack? */
+  tcp_retransmit_timer_force_update (tc);
+
   /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
    * reset dupacks to 0. Also needed if in congestion recovery */
   tc->rcv_dupacks = 0;
@@ -1300,24 +1398,33 @@
     }
 
   /* Remove retransmitted bytes that have been delivered */
-  ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
-	  >= tc->sack_sb.last_bytes_delivered
-	  || (tc->flags & TCP_CONN_FINSNT));
-
-  if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     {
+      ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+	      >= tc->sack_sb.last_bytes_delivered
+	      || (tc->flags & TCP_CONN_FINSNT));
+
       /* If we have sacks and we haven't gotten an ack beyond high_rxt,
        * remove sacked bytes delivered */
-      rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
-	- tc->sack_sb.last_bytes_delivered;
-      ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
-      tc->snd_rxt_bytes -= rxt_delivered;
+      if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+	{
+	  rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+	    - tc->sack_sb.last_bytes_delivered;
+	  ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+	  tc->snd_rxt_bytes -= rxt_delivered;
+	}
+      else
+	{
+	  /* Apparently all retransmitted holes have been acked */
+	  tc->snd_rxt_bytes = 0;
+	}
     }
   else
     {
-      /* Either all retransmitted holes have been acked, or we're
-       * "in the blind" and retransmitting segment by segment */
-      tc->snd_rxt_bytes = 0;
+      if (tc->snd_rxt_bytes > tc->bytes_acked)
+	tc->snd_rxt_bytes -= tc->bytes_acked;
+      else
+	tc->snd_rxt_bytes = 0;
     }
 
   tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
@@ -1325,7 +1432,7 @@
   /*
    * Since this was a partial ack, try to retransmit some more data
    */
-  tcp_fast_retransmit (tc);
+  tcp_program_fastretransmit (tc);
 }
 
 /**
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index ed1c641..2e6036b 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1409,7 +1409,11 @@
 
   /* Cleanly recover cc (also clears up fast retransmit) */
   if (tcp_in_fastrecovery (tc))
-    tcp_cc_fastrecovery_exit (tc);
+    {
+      /* TODO be less aggressive about this */
+      scoreboard_clear (&tc->sack_sb);
+      tcp_cc_fastrecovery_exit (tc);
+    }
 
   /* Start again from the beginning */
   tc->cc_algo->congestion (tc);
@@ -1487,6 +1491,8 @@
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
 	tcp_rxt_timeout_cc (tc);
+      else
+	scoreboard_clear (&tc->sack_sb);
 
       /* If we've sent beyond snd_congestion, update it */
       if (seq_gt (tc->snd_una_max, tc->snd_congestion))
@@ -1499,9 +1505,6 @@
        * shortfall */
       n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
 
-      /* TODO be less aggressive about this */
-      scoreboard_clear (&tc->sack_sb);
-
       if (n_bytes == 0)
 	{
 	  tcp_retransmit_timer_force_update (tc);
@@ -1680,7 +1683,7 @@
 /**
  * Retransmit first unacked segment
  */
-void
+int
 tcp_retransmit_first_unacked (tcp_connection_t * tc)
 {
   vlib_main_t *vm = vlib_get_main ();
@@ -1691,20 +1694,23 @@
   tc->snd_nxt = tc->snd_una;
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
   n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
   if (!n_bytes)
-    return;
+    return -1;
+
   bi = vlib_get_buffer_index (vm, b);
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
-
   tc->snd_nxt = old_snd_nxt;
+
+  return 0;
 }
 
 /**
  * Do fast retransmit with SACKs
  */
-void
-tcp_fast_retransmit_sack (tcp_connection_t * tc)
+int
+tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size)
 {
   vlib_main_t *vm = vlib_get_main ();
   u32 n_written = 0, offset, max_bytes, n_segs = 0;
@@ -1720,13 +1726,16 @@
   old_snd_nxt = tc->snd_nxt;
   sb = &tc->sack_sb;
   snd_space = tcp_available_cc_snd_space (tc);
+  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
 
   if (snd_space < tc->snd_mss)
-    goto done;
+    {
+      tcp_program_fastretransmit (tc);
+      goto done;
+    }
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
-  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
-  while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
+  while (snd_space > 0 && n_segs < burst_size)
     {
       hole = scoreboard_next_rxt_hole (sb, hole,
 				       tcp_fastrecovery_sent_1_smss (tc),
@@ -1736,7 +1745,21 @@
 	  if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una)
 			       || seq_gt (sb->rescue_rxt,
 					  tc->snd_congestion)))
-	    break;
+	    {
+	      if (tcp_fastrecovery_first (tc))
+		break;
+
+	      /* We tend to lose the first segment. Try re-resending
+	       * it but only once and after we've tried everything */
+	      hole = scoreboard_first_hole (sb);
+	      if (hole && hole->start == tc->snd_una)
+		{
+		  tcp_retransmit_first_unacked (tc);
+		  tcp_fastrecovery_first_on (tc);
+		  n_segs += 1;
+		}
+	      break;
+	    }
 
 	  /* If rescue rxt undefined or less than snd_una then one segment of
 	   * up to SMSS octets that MUST include the highest outstanding
@@ -1756,6 +1779,7 @@
 
 	  bi = vlib_get_buffer_index (vm, b);
 	  tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+	  n_segs += 1;
 	  break;
 	}
 
@@ -1776,22 +1800,27 @@
       tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
       ASSERT (n_written <= snd_space);
       snd_space -= n_written;
+      n_segs += 1;
     }
 
+  if (hole)
+    tcp_program_fastretransmit (tc);
+
 done:
   /* If window allows, send 1 SMSS of new data */
   tc->snd_nxt = old_snd_nxt;
+  return n_segs;
 }
 
 /**
  * Fast retransmit without SACK info
  */
-void
-tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
+int
+tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size)
 {
   vlib_main_t *vm = vlib_get_main ();
   u32 n_written = 0, offset = 0, bi, old_snd_nxt;
-  int snd_space;
+  int snd_space, n_segs = 0;
   vlib_buffer_t *b;
 
   ASSERT (tcp_in_fastrecovery (tc));
@@ -1802,7 +1831,7 @@
   tc->snd_nxt = tc->snd_una;
   snd_space = tcp_available_cc_snd_space (tc);
 
-  while (snd_space > 0)
+  while (snd_space > 0 && n_segs < burst_size)
     {
       offset += n_written;
       n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b);
@@ -1814,22 +1843,29 @@
       bi = vlib_get_buffer_index (vm, b);
       tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
       snd_space -= n_written;
+      n_segs += 1;
     }
 
+  /* More data to resend */
+  if (seq_lt (tc->snd_nxt, tc->snd_congestion))
+    tcp_program_fastretransmit (tc);
+
   /* Restore snd_nxt. If window allows, send 1 SMSS of new data */
   tc->snd_nxt = old_snd_nxt;
+
+  return n_segs;
 }
 
 /**
  * Do fast retransmit
  */
-void
-tcp_fast_retransmit (tcp_connection_t * tc)
+int
+tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size)
 {
   if (tcp_opts_sack_permitted (&tc->rcv_opts))
-    tcp_fast_retransmit_sack (tc);
+    return tcp_fast_retransmit_sack (tc, burst_size);
   else
-    tcp_fast_retransmit_no_sack (tc);
+    return tcp_fast_retransmit_no_sack (tc, burst_size);
 }
 
 static u32