tcp: fast retransmit pacing

Force pacing for fast retransmit to avoid bursts of retransmitted
packets.

When a connection enters fast recovery, the pacer is reinitialized
from the congestion window and the smoothed RTT, and each fast
retransmit burst is limited to the bytes the pacer currently allows.
Connections that exhaust the frame budget are moved to a new
per-worker postponed list, while connections with no pacer credit are
reprogrammed for the next dispatch.
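
As a rough illustration of the arithmetic this patch introduces (not
part of the change itself), the sketch below recomputes the pacer rate
and the per-dispatch burst clamp with stand-in values. TCP_TICK_S,
FRAME_SIZE and all of the numbers are assumptions made up for the
example; the real constants come from the VPP headers.

    /* Illustrative sketch only: mirrors the pacer-rate and burst
     * clamping arithmetic from this patch with example values. */
    #include <stdio.h>
    #include <stdint.h>

    #define TCP_TICK_S 0.001 /* assumed tick of 1 ms */
    #define FRAME_SIZE 256   /* assumed vlib frame size */

    int
    main (void)
    {
      uint32_t cwnd = 20 * 1460;    /* example congestion window (bytes) */
      uint32_t srtt_ticks = 50;     /* example smoothed RTT in ticks */
      uint32_t snd_mss = 1460;
      uint32_t n_segs = 10;         /* segments already queued this frame */
      uint32_t max_burst_size = 16; /* per-connection share of the frame */
      uint32_t burst_bytes = 8 * 1460; /* example pacer allowance (bytes) */

      /* Pacer rate on entering fast recovery: roughly 30% of cwnd per RTT,
       * as in byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt) */
      double byte_rate = (0.3 * cwnd) / (TCP_TICK_S * srtt_ticks);

      /* Per-dispatch burst: bounded by remaining frame space and by the
       * pacer credit expressed in full-sized segments */
      uint32_t burst_size = max_burst_size;
      if (burst_size > FRAME_SIZE - n_segs)
        burst_size = FRAME_SIZE - n_segs;
      if (burst_size > burst_bytes / snd_mss)
        burst_size = burst_bytes / snd_mss;

      printf ("pacer rate %.0f B/s, burst %u segments\n",
              byte_rate, burst_size);
      return 0;
    }

With these example numbers a dispatch sends at most 8 retransmitted
segments for the connection rather than draining the whole frame,
which is the burst-avoidance behavior the patch aims for.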

Change-Id: I2ff42c328899b36322c4de557b1f7d853dba8fe2
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 8c0c5da..bdf9c7a 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -558,10 +558,11 @@
 void
 tcp_enable_pacing (tcp_connection_t * tc)
 {
-  u32 max_burst, byte_rate;
-  max_burst = 16 * tc->snd_mss;
+  u32 initial_bucket, byte_rate;
+  initial_bucket = 16 * tc->snd_mss;
   byte_rate = 2 << 16;
-  transport_connection_tx_pacer_init (&tc->connection, byte_rate, max_burst);
+  transport_connection_tx_pacer_init (&tc->connection, byte_rate,
+				      initial_bucket);
   tc->mrtt_us = (u32) ~ 0;
 }
 
@@ -1318,6 +1319,7 @@
 
   num_threads = 1 /* main thread */  + vtm->n_threads;
   vec_validate (tm->connections, num_threads - 1);
+  vec_validate (tm->wrk_ctx, num_threads - 1);
 
   /*
    * Preallocate connections. Assume that thread 0 won't
@@ -1339,6 +1341,13 @@
       if (preallocated_connections_per_thread)
 	pool_init_fixed (tm->connections[thread],
 			 preallocated_connections_per_thread);
+      vec_validate (tm->wrk_ctx[thread].pending_fast_rxt, 0);
+      vec_validate (tm->wrk_ctx[thread].ongoing_fast_rxt, 0);
+      vec_validate (tm->wrk_ctx[thread].postponed_fast_rxt, 0);
+      vec_reset_length (tm->wrk_ctx[thread].pending_fast_rxt);
+      vec_reset_length (tm->wrk_ctx[thread].ongoing_fast_rxt);
+      vec_reset_length (tm->wrk_ctx[thread].postponed_fast_rxt);
+      tm->wrk_ctx[thread].vm = vlib_mains[thread];
     }
 
   /*
@@ -1358,7 +1367,6 @@
       clib_spinlock_init (&tm->half_open_lock);
     }
 
-  vec_validate (tm->wrk_ctx, num_threads - 1);
   tcp_initialize_timer_wheels (tm);
 
   tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 4ba3d5e..f7424c3 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -160,7 +160,7 @@
 };
 
 #define TCP_SCOREBOARD_TRACE (0)
-#define TCP_MAX_SACK_BLOCKS 32	/**< Max number of SACK blocks stored */
+#define TCP_MAX_SACK_BLOCKS 256	/**< Max number of SACK blocks stored */
 #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
 
 typedef struct _scoreboard_trace_elt
@@ -390,6 +390,9 @@
 						     needing fast rxt */
   u32 *ongoing_fast_rxt;			/**< vector of connections
 						     now doing fast rxt */
+  u32 *postponed_fast_rxt;			/**< vector of connections
+						     that will do fast rxt */
+  vlib_main_t *vm;				/**< pointer to vm */
 
     CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
   u8 cached_opts[40];				/**< cached 'on the wire'
@@ -722,8 +725,8 @@
 tcp_cc_rcv_ack (tcp_connection_t * tc)
 {
   tc->cc_algo->rcv_ack (tc);
-  tcp_update_pacer (tc);
   tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+  tcp_update_pacer (tc);
 }
 
 always_inline void
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 8f626b1..cd4a6f0 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -627,10 +627,8 @@
 
 #if TCP_DEBUG_CC
 
-#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)			\
+#define TCP_EVT_CC_EVT_PRINT(_tc, _sub_evt)				\
 {									\
-  if (_tc->snd_una != _tc->iss)						\
-    TCP_EVT_CC_STAT_PRINT (_tc);					\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
     .format = "cc: %s snd_space %u snd_una %u out %u flight %u",	\
@@ -638,8 +636,8 @@
     .n_enum_strings = 7,						\
     .enum_strings = {                                           	\
       "fast-rxt",	                                             	\
-      "rxt-timeout",                                                 	\
       "first-rxt",                                                 	\
+      "rxt-timeout",                                                 	\
       "recovered",							\
       "congestion",							\
       "undo",								\
@@ -653,8 +651,18 @@
   ed->data[3] = tcp_bytes_out(_tc);					\
   ed->data[4] = tcp_flight_size (_tc);					\
 }
+
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)			\
+{									\
+  if (_tc->snd_una != _tc->iss)						\
+    TCP_EVT_CC_STAT_PRINT (_tc);					\
+  if ((_sub_evt <= 1 && TCP_DEBUG_CC > 1)				\
+      || (_sub_evt > 1 && TCP_DEBUG_CC > 0))				\
+      TCP_EVT_CC_EVT_PRINT (_tc, _sub_evt);				\
+}
 #else
-#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)
+#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)			\
+
 #endif
 
 #if TCP_DEBUG_CC > 1
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index ac0e996..154b9ac 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -1199,67 +1199,60 @@
 tcp_do_fastretransmits (u32 thread_index)
 {
   tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
-  u32 max_burst_size, burst_size, n_segs = 0;
+  u32 max_burst_size, burst_size, n_segs = 0, n_segs_now;
+  u32 *ongoing_fast_rxt, burst_bytes, sent_bytes;
   tcp_connection_t *tc;
+  u64 last_cpu_time;
   int i;
 
-  if (vec_len (wrk->pending_fast_rxt) == 0)
+  if (vec_len (wrk->pending_fast_rxt) == 0
+      && vec_len (wrk->postponed_fast_rxt) == 0)
     return;
 
-  vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt);
-  vec_reset_length (wrk->pending_fast_rxt);
+  last_cpu_time = wrk->vm->clib_time.last_cpu_time;
+  ongoing_fast_rxt = wrk->ongoing_fast_rxt;
+  vec_append (ongoing_fast_rxt, wrk->postponed_fast_rxt);
+  vec_append (ongoing_fast_rxt, wrk->pending_fast_rxt);
+
+  _vec_len (wrk->postponed_fast_rxt) = 0;
+  _vec_len (wrk->pending_fast_rxt) = 0;
 
   max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
   max_burst_size = clib_max (max_burst_size, 1);
 
-  for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++)
+  for (i = 0; i < vec_len (ongoing_fast_rxt); i++)
     {
-      tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index);
+      if (n_segs >= VLIB_FRAME_SIZE)
+	{
+	  vec_add1 (wrk->postponed_fast_rxt, ongoing_fast_rxt[i]);
+	  continue;
+	}
+
+      tc = tcp_connection_get (ongoing_fast_rxt[i], thread_index);
       tc->flags &= ~TCP_CONN_FRXT_PENDING;
 
       if (!tcp_in_fastrecovery (tc))
 	continue;
 
-      /* TODO tx pacer instead of this */
-      if (n_segs >= VLIB_FRAME_SIZE)
+      burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+      burst_bytes = transport_connection_tx_pacer_burst (&tc->connection,
+							 last_cpu_time);
+      burst_size = clib_min (burst_size, burst_bytes / tc->snd_mss);
+      if (!burst_size)
 	{
 	  tcp_program_fastretransmit (tc);
 	  continue;
 	}
 
-      burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+      n_segs_now = tcp_fast_retransmit (tc, burst_size);
+      sent_bytes = clib_min (n_segs_now * tc->snd_mss, burst_bytes);
+      transport_connection_tx_pacer_update_bytes (&tc->connection,
+						  sent_bytes);
 
-      if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss)
-	{
-	  /* The first segment MUST be retransmitted */
-	  if (tcp_retransmit_first_unacked (tc))
-	    {
-	      tcp_program_fastretransmit (tc);
-	      continue;
-	    }
-
-	  /* Post retransmit update cwnd to ssthresh and account for the
-	   * three segments that have left the network and should've been
-	   * buffered at the receiver XXX */
-	  tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
-
-	  /* If cwnd allows, send more data */
-	  if (tcp_opts_sack_permitted (&tc->rcv_opts))
-	    {
-	      scoreboard_init_high_rxt (&tc->sack_sb,
-					tc->snd_una + tc->snd_mss);
-	      tc->sack_sb.rescue_rxt = tc->snd_una - 1;
-	      n_segs += tcp_fast_retransmit_sack (tc, burst_size);
-	    }
-	  else
-	    {
-	      n_segs += tcp_fast_retransmit_no_sack (tc, burst_size);
-	    }
-	}
-      else
-	n_segs += tcp_fast_retransmit (tc, burst_size);
+      n_segs += n_segs_now;
     }
-  vec_reset_length (wrk->ongoing_fast_rxt);
+  _vec_len (ongoing_fast_rxt) = 0;
+  wrk->ongoing_fast_rxt = ongoing_fast_rxt;
 }
 
 /**
@@ -1298,6 +1291,7 @@
 	}
       else if (tcp_should_fastrecover (tc))
 	{
+	  u32 byte_rate;
 	  ASSERT (!tcp_in_fastrecovery (tc));
 
 	  /* Heuristic to catch potential late dupacks
@@ -1313,8 +1307,21 @@
 	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
 
 	  if (tcp_opts_sack_permitted (&tc->rcv_opts))
-	    tc->sack_sb.high_rxt = tc->snd_una;
+	    {
+	      tc->cwnd = tc->ssthresh;
+	      scoreboard_init_high_rxt (&tc->sack_sb, tc->snd_una);
+	      tc->sack_sb.rescue_rxt = tc->snd_una - 1;
+	    }
+	  else
+	    {
+	      /* Post retransmit update cwnd to ssthresh and account for the
+	       * three segments that have left the network and should've been
+	       * buffered at the receiver XXX */
+	      tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
+	    }
 
+	  byte_rate = (0.3 * tc->cwnd) / ((f64) TCP_TICK * tc->srtt);
+	  transport_connection_tx_pacer_init (&tc->connection, byte_rate, 0);
 	  tcp_program_fastretransmit (tc);
 	  return;
 	}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index f14a612..81579ef 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1452,10 +1452,10 @@
       tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
     }
 
-  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
-
   if (tc->state >= TCP_STATE_ESTABLISHED)
     {
+      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
       /* Lost FIN, retransmit and return */
       if (tcp_is_lost_fin (tc))
 	{
@@ -1536,6 +1536,8 @@
 	  return;
 	}
 
+      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
       /* Try without increasing RTO a number of times. If this fails,
        * start growing RTO exponentially */
       tc->rto_boff += 1;
@@ -1562,6 +1564,8 @@
   /* Retransmit SYN-ACK */
   else if (tc->state == TCP_STATE_SYN_RCVD)
     {
+      TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
       tc->rto_boff += 1;
       if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
 	tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
@@ -1693,7 +1697,7 @@
   old_snd_nxt = tc->snd_nxt;
   tc->snd_nxt = tc->snd_una;
 
-  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
 
   n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
   if (!n_bytes)