tcp: pacer and mrtt estimation improvements

- update pacer once per burst
- better estimate initial rtt
- compute smoothed average for higher precision rtt estimate

Change-Id: I06d41a98784cdf861bedfbee2e7d0afc0d0154ef
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c
index 64c873c..22d8d3c 100644
--- a/src/vnet/session/session_node.c
+++ b/src/vnet/session/session_node.c
@@ -99,7 +99,7 @@
   s = session_get_if_valid (index, thread_index);
   if (!s)
     {
-      clib_warning ("Invalid session!");
+      SESSION_DBG ("Invalid session!");
       return;
     }
   app_wrk = app_worker_get (s->app_wrk_index);
@@ -751,7 +751,7 @@
 session_update_dispatch_period (session_manager_worker_t * wrk, f64 now,
 				u32 thread_index)
 {
-  if (wrk->last_tx_packets > 1)
+  if (wrk->last_tx_packets)
     {
       f64 sample = now - wrk->last_vlib_time;
       wrk->dispatch_period = (wrk->dispatch_period + sample) * 0.5;
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index ea350dd..d759cf0 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -802,9 +802,10 @@
 	      tcp_rcv_wnd_available (tc));
   s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent,
 	      tcp_time_now () - tc->tsval_recent_age);
-  s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %2.5f ",
-	      tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
-  s = format (s, "rtt_seq %u\n", tc->rtt_seq - tc->iss);
+  s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %x",
+	      tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar,
+	      tc->rtt_ts);
+  s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss);
   s = format (s, " cong:   %U", format_tcp_congestion, tc);
 
   if (tc->state >= TCP_STATE_ESTABLISHED)
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 5a3a965..843b90d 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -753,7 +753,6 @@
 {
   tc->cc_algo->rcv_ack (tc);
   tc->tsecr_last_ack = tc->rcv_opts.tsecr;
-  tcp_connection_tx_pacer_update (tc);
 }
 
 always_inline void
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index cd4a6f0..d125ee8 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -806,13 +806,14 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "rcv_stat: rto %u srtt %u rttvar %u ",			\
-    .format_args = "i4i4i4",						\
+    .format = "rcv_stat: rto %u srtt %u mrtt-us %u rttvar %u",		\
+    .format_args = "i4i4i4i4",						\
   };									\
-  DECLARE_ETD(_tc, _e, 3);						\
+  DECLARE_ETD(_tc, _e, 4);						\
   ed->data[0] = _tc->rto;						\
   ed->data[1] = _tc->srtt;						\
-  ed->data[2] = _tc->rttvar;						\
+  ed->data[2] = (u32) (_tc->mrtt_us * 1e6);				\
+  ed->data[3] = _tc->rttvar;	 					\
 }
 
 #define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...)				\
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 9c303eb..0f1ab1a 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -455,8 +455,11 @@
 
   if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
     {
-      tc->mrtt_us = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
-      mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
+      f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
+      tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125;
+      mrtt = clib_max ((u32) (sample * THZ), 1);
+      /* Allow measuring of a new RTT */
+      tc->rtt_ts = 0;
     }
   /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
    * snd_una, i.e., the left side of the send window:
@@ -475,9 +478,6 @@
 
 done:
 
-  /* Allow measuring of a new RTT */
-  tc->rtt_ts = 0;
-
   /* If we got here something must've been ACKed so make sure boff is 0,
    * even if mrtt is not valid since we update the rto lower */
   tc->rto_boff = 0;
@@ -486,6 +486,29 @@
   return 0;
 }
 
+/** Take the connection's first RTT sample, store it in mrtt_us and feed it to tcp_estimate_rtt() */
+static void
+tcp_estimate_initial_rtt (tcp_connection_t * tc)
+{
+  u8 thread_index = vlib_num_workers ()? 1 : 0;	/* index 1 when workers exist, else 0 */
+  int mrtt;			/* sample passed to tcp_estimate_rtt() */
+
+  if (tc->rtt_ts)		/* a send time was recorded: measure against it */
+    {
+      tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
+      mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);	/* scale by THZ, floor at 1 */
+      tc->rtt_ts = 0;		/* clear so a new measurement can be started */
+    }
+  else				/* no send time: use rcv_opts.tsecr, presumably the echoed timestamp */
+    {
+      mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
+      tc->mrtt_us = (f64) mrtt *TCP_TICK;
+      /* NOTE(review): mrtt_us is written before the range check below -- confirm intended */
+    }
+  if (mrtt > 0 && mrtt < TCP_RTT_MAX)	/* out-of-range samples are not passed on */
+    tcp_estimate_rtt (tc, mrtt);
+}
+
 /**
  * Dequeue bytes for connections that have received acks in last burst
  */
@@ -506,6 +529,9 @@
       tc = tcp_connection_get (pending_deq_acked[i], thread_index);
       tc->flags &= ~TCP_CONN_DEQ_PENDING;
 
+      if (PREDICT_FALSE (!tc->burst_acked))
+	continue;
+
       /* Dequeue the newly ACKed bytes */
       stream_session_dequeue_drop (&tc->connection, tc->burst_acked);
       tc->burst_acked = 0;
@@ -514,6 +540,11 @@
       /* If everything has been acked, stop retransmit timer
        * otherwise update. */
       tcp_retransmit_timer_update (tc);
+
+      /* If not congested, update pacer based on our new
+       * cwnd estimate */
+      if (!tcp_in_fastrecovery (tc))
+	tcp_connection_tx_pacer_update (tc);
     }
   _vec_len (wrk->pending_deq_acked) = 0;
 }
@@ -1084,6 +1115,7 @@
   tcp_update_rto (tc);
   tc->snd_rxt_ts = 0;
   tc->snd_nxt = tc->snd_una_max;
+  tc->rtt_ts = 0;
   tcp_recovery_off (tc);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
 }
@@ -1096,6 +1128,7 @@
   tc->rcv_dupacks = 0;
   tc->snd_nxt = tc->snd_una_max;
   tc->snd_rxt_bytes = 0;
+  tc->rtt_ts = 0;
 
   tcp_fastrecovery_off (tc);
   tcp_fastrecovery_1_smss_off (tc);
@@ -1381,6 +1414,10 @@
    * Legitimate ACK. 1) See if we can exit recovery
    */
 
+  /* Update the pacing rate. For the first partial ack we move from
+   * the artificially constrained rate to the one after congestion */
+  tcp_connection_tx_pacer_update (tc);
+
   if (seq_geq (tc->snd_una, tc->snd_congestion))
     {
       tcp_retransmit_timer_update (tc);
@@ -1403,10 +1440,6 @@
    * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
    */
 
-  /* Update the pacing rate. For the first partial ack we move from
-   * the artificially constrained rate to the one after congestion */
-  tcp_connection_tx_pacer_update (tc);
-
   /* XXX limit this only to first partial ack? */
   tcp_retransmit_timer_force_update (tc);
 
@@ -2427,7 +2460,7 @@
 	    }
 
 	  /* Update rtt with the syn-ack sample */
-	  tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number);
+	  tcp_estimate_initial_rtt (new_tc0);
 	  TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0);
 	  error0 = TCP_ERROR_SYN_ACKS_RCVD;
 	}
@@ -2636,7 +2669,7 @@
 	    }
 
 	  /* Update rtt and rto */
-	  tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
+	  tcp_estimate_initial_rtt (tc0);
 
 	  /* Switch state to ESTABLISHED */
 	  tc0->state = TCP_STATE_ESTABLISHED;
@@ -2687,6 +2720,12 @@
 	       * wait for peer's FIN but not indefinitely. */
 	      tcp_connection_timers_reset (tc0);
 	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+
+	      /* Don't try to deq the FIN acked */
+	      if (tc0->burst_acked > 1)
+		stream_session_dequeue_drop (&tc0->connection,
+					     tc0->burst_acked - 1);
+	      tc0->burst_acked = 0;
 	    }
 	  break;
 	case TCP_STATE_FIN_WAIT_2:
@@ -2695,6 +2734,7 @@
 	   * acknowledged ("ok") but do not delete the TCB. */
 	  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
 	    goto drop;
+	  tc0->burst_acked = 0;
 	  break;
 	case TCP_STATE_CLOSE_WAIT:
 	  /* Do the same processing as for the ESTABLISHED state. */
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 089f85a..192e820 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1000,7 +1000,7 @@
   tcp_make_syn (tc, b);
 
   /* Measure RTT with this */
-  tc->rtt_ts = tcp_time_now ();
+  tc->rtt_ts = tcp_time_now_us (vlib_num_workers ()? 1 : 0);
   tc->rtt_seq = tc->snd_nxt;
   tc->rto_boff = 0;