session/tcp: improve preallocated segment handling

- add preallocated segment flag
- don't remove preallocated segments unless the application detaches
- when preallocating fifos in multiple segments, completely fill
  a segment before moving to the next
- detach server application from segment-managers when deleting app
- batch syn/syn-ack/fin (re)transmissions
- loosen up close-wait and time-wait times

Change-Id: I412f53ce601cc83b3acc26aeffd7fa2d52d73b03
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 5b4c867..527b328 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -510,7 +510,7 @@
       if ((i % 4) == 0)
 	vlib_process_suspend (vm, 10e-6);
       ASSERT (i + 1 >= tm->ready_connections);
-      while (i + 1 - tm->ready_connections > 8000)
+      while (i + 1 - tm->ready_connections > 1000)
 	{
 	  vlib_process_suspend (vm, 100e-6);
 	}
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 04f1e06..f779428 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -1035,7 +1035,7 @@
   /* If not snd_wnd constrained and we can't write at least a segment,
    * don't try at all */
   if (PREDICT_FALSE (snd_space < tc->snd_mss))
-    return 0;
+    return snd_space < tc->cwnd ? 0 : snd_space;
 
   /* round down to mss multiple */
   return snd_space - (snd_space % tc->snd_mss);
@@ -1167,6 +1167,7 @@
     {
       ASSERT (tc->state == TCP_STATE_SYN_SENT);
       stream_session_connect_notify (&tc->connection, 1 /* fail */ );
+      TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2);
     }
   else
     {
@@ -1174,7 +1175,7 @@
       /* note: the connection may have already disappeared */
       if (PREDICT_FALSE (tc == 0))
 	return;
-
+      TCP_DBG ("establish pop: %U", format_tcp_connection, tc, 2);
       ASSERT (tc->state == TCP_STATE_SYN_RCVD);
       /* Start cleanup. App wasn't notified yet so use delete notify as
        * opposed to delete to cleanup session layer state. */
@@ -1369,6 +1370,8 @@
 
   vec_validate (tm->tx_frames[0], num_threads - 1);
   vec_validate (tm->tx_frames[1], num_threads - 1);
+  vec_validate (tm->ip_lookup_tx_frames[0], num_threads - 1);
+  vec_validate (tm->ip_lookup_tx_frames[1], num_threads - 1);
 
   tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size
     (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 6020a3d..bb8091a 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -99,8 +99,9 @@
 #define TCP_ESTABLISH_TIME      750	/* 75s */
 #define TCP_SYN_RCVD_TIME	600	/* 60s */
 #define TCP_2MSL_TIME           300	/* 30s */
-#define TCP_CLOSEWAIT_TIME	20	/* 0.1s */
-#define TCP_CLEANUP_TIME	5	/* 0.5s Time to wait before cleanup */
+#define TCP_CLOSEWAIT_TIME	20	/* 2s */
+#define TCP_TIMEWAIT_TIME	20	/* 2s */
+#define TCP_CLEANUP_TIME	10	/* 1s Time to wait before cleanup */
 #define TCP_TIMER_PERSIST_MIN	2	/* 0.2s */
 
 #define TCP_RTO_MAX 60 * THZ	/* Min max RTO (60s) as per RFC6298 */
@@ -372,8 +373,10 @@
 
   /** per-worker tx buffer free lists */
   u32 **tx_buffers;
-  /** per-worker tx frames to 4/6 output nodes */
+  /** per-worker tx frames to tcp 4/6 output nodes */
   vlib_frame_t **tx_frames[2];
+  /** per-worker tx frames to ip 4/6 lookup nodes */
+  vlib_frame_t **ip_lookup_tx_frames[2];
 
   /* Per worker-thread timer wheel for connections timers */
   tw_timer_wheel_16t_2w_512sl_t *timer_wheels;
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index cf77e6e..4bc6b42 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -82,13 +82,7 @@
  * Infra and evt track setup
  */
 
-#define TCP_DBG(_tc, _evt, _args...)					\
-{   		            						\
-    u8 *_tmp = 0;							\
-    _tmp = format(_tmp, "%U", format_tcp_connection_verbose, _tc);	\
-    clib_warning("%s", _tmp);						\
-    vec_free(_tmp);							\
-}
+#define TCP_DBG(_fmt, _args...) clib_warning (_fmt, ##_args)
 
 #define DECLARE_ETD(_tc, _e, _size)					\
   struct								\
@@ -240,6 +234,7 @@
 #define TCP_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
 #else
 #define TCP_EVT_DBG(_evt, _args...)
+#define TCP_DBG(_fmt, _args...)
 #endif
 
 /*
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 841e72a..64a0707 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -351,12 +351,17 @@
   if (tcp_syn (th0))
     {
       /* TODO implement RFC 5961 */
-      if (tc0->state != TCP_STATE_SYN_RCVD)
-	tcp_make_ack (tc0, b0);
+      if (tc0->state == TCP_STATE_SYN_RCVD)
+	{
+	  tcp_make_synack (tc0, b0);
+	  TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
+	}
       else
-	tcp_make_synack (tc0, b0);
+	{
+	  tcp_make_ack (tc0, b0);
+	  TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0);
+	}
       *next0 = tcp_next_output (tc0->c_is_ip4);
-      TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
       return -1;
     }
 
@@ -1747,18 +1752,17 @@
 	  /* 8: check the FIN bit */
 	  if (PREDICT_FALSE (is_fin))
 	    {
-	      /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
-	       * wait for session to call close. To avoid lingering
+	      /* Enter CLOSE-WAIT and notify session. To avoid lingering
 	       * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
-	      tc0->state = TCP_STATE_CLOSE_WAIT;
-	      TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
+	      /* Account for the FIN if nothing else was received */
 	      if (vnet_buffer (b0)->tcp.data_len == 0)
-		{
-		  tc0->rcv_nxt += 1;
-		  next0 = TCP_ESTABLISHED_NEXT_DROP;
-		}
+		tc0->rcv_nxt += 1;
+	      tcp_make_ack (tc0, b0);
+	      next0 = tcp_next_output (tc0->c_is_ip4);
+	      tc0->state = TCP_STATE_CLOSE_WAIT;
 	      stream_session_disconnect_notify (&tc0->connection);
 	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+	      TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
 	    }
 
 	done:
@@ -1973,6 +1977,12 @@
 	  seq0 = vnet_buffer (b0)->tcp.seq_number;
 	  tcp0 = tcp_buffer_hdr (b0);
 
+	  /* Crude check to see if the connection handle does not match
+	   * the packet. Probably connection just switched to established */
+	  if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
+			     || tcp0->src_port != tc0->c_rmt_port))
+	    goto drop;
+
 	  if (PREDICT_FALSE
 	      (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
 	    goto drop;
@@ -2265,6 +2275,7 @@
 	  tcp_header_t *tcp0 = 0;
 	  tcp_connection_t *tc0;
 	  u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
+	  u8 is_fin0;
 
 	  bi0 = from[0];
 	  to_next[0] = bi0;
@@ -2283,11 +2294,11 @@
 	    }
 
 	  tcp0 = tcp_buffer_hdr (b0);
+	  is_fin0 = tcp_is_fin (tcp0);
 
 	  /* SYNs, FINs and data consume sequence numbers */
 	  vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
-	    + tcp_is_syn (tcp0) + tcp_is_fin (tcp0)
-	    + vnet_buffer (b0)->tcp.data_len;
+	    + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len;
 
 	  if (CLIB_DEBUG)
 	    {
@@ -2384,21 +2395,14 @@
 	      /* If FIN is ACKed */
 	      else if (tc0->snd_una == tc0->snd_una_max)
 		{
-		  tc0->rcv_nxt += 1;
 		  tc0->state = TCP_STATE_FIN_WAIT_2;
 		  TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 
-		  if (tcp_fin (tcp0))
-		    {
-		      /* Stop all timers, 2MSL will be set lower */
-		      tcp_connection_timers_reset (tc0);
-		    }
-		  else
-		    {
-		      /* Wait for peer to finish sending its data */
-		      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE,
-					TCP_2MSL_TIME);
-		    }
+		  /* Stop all retransmit timers because we have nothing more
+		   * to send. Enable waitclose though because we're willing to
+		   * wait for peer's FIN but not indefinitely. */
+		  tcp_connection_timers_reset (tc0);
+		  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
 		}
 	      break;
 	    case TCP_STATE_FIN_WAIT_2:
@@ -2434,10 +2438,10 @@
 	      if (!tcp_rcv_ack_is_acceptable (tc0, b0))
 		goto drop;
 
+	      tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
 	      /* Apparently our FIN was lost */
-	      if (tcp_fin (tcp0))
+	      if (is_fin0)
 		{
-		  /* Don't "make" fin since that increments snd_nxt */
 		  tcp_send_fin (tc0);
 		  goto drop;
 		}
@@ -2450,8 +2454,6 @@
 	       * particular, this makes sure that we won't have dead sessions
 	       * when processing events on the tx path */
 	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
-
-	      /* Stop retransmit */
 	      tcp_retransmit_timer_reset (tc0);
 
 	      goto drop;
@@ -2466,8 +2468,7 @@
 		goto drop;
 
 	      tcp_make_ack (tc0, b0);
-	      tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE);
-	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
 
 	      goto drop;
 
@@ -2486,6 +2487,8 @@
 	    case TCP_STATE_FIN_WAIT_2:
 	      if (vnet_buffer (b0)->tcp.data_len)
 		error0 = tcp_segment_rcv (tm, tc0, b0, &next0);
+	      else if (is_fin0)
+		tc0->rcv_nxt += 1;
 	      break;
 	    case TCP_STATE_CLOSE_WAIT:
 	    case TCP_STATE_CLOSING:
@@ -2497,7 +2500,7 @@
 	    }
 
 	  /* 8: check the FIN bit */
-	  if (!tcp_fin (tcp0))
+	  if (!is_fin0)
 	    goto drop;
 
 	  switch (tc0->state)
@@ -2527,19 +2530,19 @@
 	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
 	      break;
 	    case TCP_STATE_FIN_WAIT_2:
-	      /* Got FIN, send ACK! */
+	      /* Got FIN, send ACK! Be more aggressive with resource cleanup */
 	      tc0->state = TCP_STATE_TIME_WAIT;
 	      tcp_connection_timers_reset (tc0);
-	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
 	      tcp_make_ack (tc0, b0);
 	      next0 = tcp_next_output (is_ip4);
 	      TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 	      break;
 	    case TCP_STATE_TIME_WAIT:
-	      /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
+	      /* Remain in the TIME-WAIT state. Restart the time-wait
 	       * timeout.
 	       */
-	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+	      tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME);
 	      break;
 	    }
 	  TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
@@ -3162,9 +3165,9 @@
     TCP_ERROR_NONE);
   _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
   _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
-  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
+  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
   _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
-  _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET,
+  _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP,
     TCP_ERROR_CONNECTION_CLOSED);
 #undef _
 }
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index b843c92..be29f05 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -629,9 +629,11 @@
 }
 
 always_inline void
-tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
-			  u8 is_ip4)
+tcp_enqueue_to_ip_lookup_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+			    u8 is_ip4, u8 flush)
 {
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 *to_next, next_index;
   vlib_frame_t *f;
 
@@ -643,13 +645,42 @@
 
   /* Send to IP lookup */
   next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
-  f = vlib_get_frame_to_node (vm, next_index);
+  if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
+    {
+      b->pre_data[0] = 2;
+      b->pre_data[1] = next_index;
+    }
 
-  /* Enqueue the packet */
+  f = tm->ip_lookup_tx_frames[!is_ip4][thread_index];
+  if (!f)
+    {
+      f = vlib_get_frame_to_node (vm, next_index);
+      ASSERT (f);
+      tm->ip_lookup_tx_frames[!is_ip4][thread_index] = f;
+    }
+
   to_next = vlib_frame_vector_args (f);
-  to_next[0] = bi;
-  f->n_vectors = 1;
-  vlib_put_frame_to_node (vm, next_index, f);
+  to_next[f->n_vectors] = bi;
+  f->n_vectors += 1;
+  if (flush || f->n_vectors == VLIB_FRAME_SIZE)
+    {
+      vlib_put_frame_to_node (vm, next_index, f);
+      tm->ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+    }
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+			      u8 is_ip4)
+{
+  tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 1);
+}
+
+always_inline void
+tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi,
+			  u8 is_ip4)
+{
+  tcp_enqueue_to_ip_lookup_i (vm, b, bi, is_ip4, 0);
 }
 
 always_inline void
@@ -666,8 +697,6 @@
 
   /* Decide where to send the packet */
   next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
-
-  /* Initialize the trajectory trace, if configured */
   if (VLIB_BUFFER_TRACE_TRAJECTORY > 0)
     {
       b->pre_data[0] = 1;
@@ -856,7 +885,7 @@
       ASSERT (!bogus);
     }
 
-  tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4);
+  tcp_enqueue_to_ip_lookup_now (vm, b, bi, is_ip4);
   TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
 }
 
@@ -968,7 +997,24 @@
 }
 
 /**
- * Flush both v4 and v6 tx frames for thread index
+ * Flush ip lookup tx frames populated by timer pops
+ */
+always_inline void
+tcp_flush_frame_to_ip_lookup (vlib_main_t * vm, u8 thread_index, u8 is_ip4)
+{
+  if (tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index])
+    {
+      u32 next_index;
+      next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
+      vlib_put_frame_to_node (vm, next_index,
+			      tcp_main.ip_lookup_tx_frames[!is_ip4]
+			      [thread_index]);
+      tcp_main.ip_lookup_tx_frames[!is_ip4][thread_index] = 0;
+    }
+}
+
+/**
+ * Flush v4 and v6 tcp and ip-lookup tx frames for thread index
  */
 void
 tcp_flush_frames_to_output (u8 thread_index)
@@ -976,6 +1022,8 @@
   vlib_main_t *vm = vlib_get_main ();
   tcp_flush_frame_to_output (vm, thread_index, 1);
   tcp_flush_frame_to_output (vm, thread_index, 0);
+  tcp_flush_frame_to_ip_lookup (vm, thread_index, 1);
+  tcp_flush_frame_to_ip_lookup (vm, thread_index, 0);
 }
 
 /**
@@ -984,22 +1032,28 @@
 void
 tcp_send_fin (tcp_connection_t * tc)
 {
-  vlib_buffer_t *b;
-  u32 bi;
   tcp_main_t *tm = vnet_get_tcp_main ();
   vlib_main_t *vm = vlib_get_main ();
+  vlib_buffer_t *b;
+  u32 bi;
+  u8 fin_snt = 0;
+
 
   if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
     return;
   b = vlib_get_buffer (vm, bi);
-  /* buffer will be initialized by in tcp_make_fin */
+  fin_snt = tc->flags & TCP_CONN_FINSNT;
+  if (fin_snt)
+    tc->snd_nxt = tc->snd_una;
   tcp_make_fin (tc, b);
   tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
-  if (!(tc->flags & TCP_CONN_FINSNT))
+  if (!fin_snt)
     {
       tc->flags |= TCP_CONN_FINSNT;
       tc->flags &= ~TCP_CONN_FINPNDG;
-      tc->snd_nxt += 1;
+      /* Account for the FIN */
+      tc->snd_una_max += 1;
+      tc->snd_nxt = tc->snd_una_max;
     }
   tcp_retransmit_timer_force_update (tc);
   TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
@@ -1398,7 +1452,8 @@
   else if (tc->state == TCP_STATE_SYN_RCVD)
     {
       tc->rto_boff += 1;
-      tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+      if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
+	tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
       tc->rtt_ts = 0;
 
       if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
@@ -1414,7 +1469,7 @@
   else
     {
       ASSERT (tc->state == TCP_STATE_CLOSED);
-      clib_warning ("connection closed ...");
+      TCP_DBG ("connection state: %d", tc->state);
       return;
     }
 }