tcp: horizontal scaling improvements

- do not scale syn-ack window
- fix the max number of outstanding syns in builtin client
- fix syn-sent ack validation to use modulo arithmetic
- improve retransmit timer handler
- fix output buffer allocator leakage
- improve debugging

Change-Id: Iac3bc0eadf7d0b494a93e22d210a3153b61b3273
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 94e6b4a..5b4c867 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -509,6 +509,11 @@
       /* Crude pacing for call setups  */
       if ((i % 4) == 0)
 	vlib_process_suspend (vm, 10e-6);
+      ASSERT (i + 1 >= tm->ready_connections);
+      while (i + 1 - tm->ready_connections > 8000)
+	{
+	  vlib_process_suspend (vm, 100e-6);
+	}
     }
 }
 
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index a4c1308..04f1e06 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -160,6 +160,7 @@
 {
   tcp_main_t *tm = vnet_get_tcp_main ();
   tcp_connection_t *tc = 0;
+  ASSERT (vlib_get_thread_index () == 0);
   pool_get (tm->half_open_connections, tc);
   memset (tc, 0, sizeof (*tc));
   tc->c_c_index = tc - tm->half_open_connections;
@@ -561,6 +562,22 @@
 }
 #endif /* 0 */
 
+/**
+ * Initialize connection send variables.
+ */
+void
+tcp_init_snd_vars (tcp_connection_t * tc)
+{
+  u32 time_now;
+
+  /* Set random initial sequence */
+  time_now = tcp_time_now ();
+  tc->iss = random_u32 (&time_now);
+  tc->snd_una = tc->iss;
+  tc->snd_nxt = tc->iss + 1;
+  tc->snd_una_max = tc->snd_nxt;
+}
+
 /** Initialize tcp connection variables
  *
  * Should be called after having received a msg from the peer, i.e., a SYN or
@@ -572,6 +589,9 @@
   tcp_init_mss (tc);
   scoreboard_init (&tc->sack_sb);
   tcp_cc_init (tc);
+  if (tc->state == TCP_STATE_SYN_RCVD)
+    tcp_init_snd_vars (tc);
+
   //  tcp_connection_fib_attach (tc);
 }
 
@@ -691,6 +711,7 @@
 
   TCP_EVT_DBG (TCP_EVT_OPEN, tc);
   tc->state = TCP_STATE_SYN_SENT;
+  tcp_init_snd_vars (tc);
   tcp_send_syn (tc);
   clib_spinlock_unlock_if_init (&tm->half_open_lock);
 
@@ -784,7 +805,7 @@
 	      tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs,
 	      tc->snd_wl2 - tc->iss);
   s = format (s, " flight size %u send space %u rcv_wnd_av %d\n",
-	      tcp_flight_size (tc), tcp_available_snd_space (tc),
+	      tcp_flight_size (tc), tcp_available_output_snd_space (tc),
 	      tcp_rcv_wnd_available (tc));
   s = format (s, " cong %U ", format_tcp_congestion_status, tc);
   s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
@@ -1155,6 +1176,9 @@
 	return;
 
       ASSERT (tc->state == TCP_STATE_SYN_RCVD);
+      /* Start cleanup. App wasn't notified yet so use delete notify as
+       * opposed to delete to cleanup session layer state. */
+      stream_session_delete_notify (&tc->connection);
     }
   tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
   tcp_connection_cleanup (tc);
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 11d61f5..6020a3d 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -97,7 +97,7 @@
 						 * ticks to timer units */
 #define TCP_DELACK_TIME         1	/* 0.1s */
 #define TCP_ESTABLISH_TIME      750	/* 75s */
-#define TCP_SYN_RCVD_TIME	100	/* 10s */
+#define TCP_SYN_RCVD_TIME	600	/* 60s */
 #define TCP_2MSL_TIME           300	/* 30s */
 #define TCP_CLOSEWAIT_TIME	20	/* 0.1s */
 #define TCP_CLEANUP_TIME	5	/* 0.5s Time to wait before cleanup */
@@ -676,6 +676,7 @@
 
 void tcp_connection_timers_init (tcp_connection_t * tc);
 void tcp_connection_timers_reset (tcp_connection_t * tc);
+void tcp_init_snd_vars (tcp_connection_t * tc);
 void tcp_connection_init_vars (tcp_connection_t * tc);
 
 always_inline void
@@ -690,6 +691,7 @@
 tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
 {
   ASSERT (tc->c_thread_index == vlib_get_thread_index ());
+  ASSERT (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID);
   tc->timers[timer_id]
     = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index],
 				   tc->c_c_index, timer_id, interval);
@@ -722,6 +724,7 @@
 always_inline void
 tcp_retransmit_timer_set (tcp_connection_t * tc)
 {
+  ASSERT (tc->snd_una != tc->snd_una_max);
   tcp_timer_set (tc, TCP_TIMER_RETRANSMIT,
 		 clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
 }
@@ -769,7 +772,7 @@
     {
       tcp_retransmit_timer_reset (tc);
       if (tc->snd_wnd < tc->snd_mss)
-	tcp_persist_timer_set (tc);
+	tcp_persist_timer_update (tc);
     }
   else
     tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index fc36eb2..cf77e6e 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -197,9 +197,10 @@
   ed->data[0] = _tc->c_c_index;						\
 }
 
-#define TCP_EVT_SYN_RCVD_HANDLER(_tc, ...)				\
+#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...)				\
 {									\
-  TCP_EVT_INIT_HANDLER(_tc, 0);						\
+  if (_init)								\
+    TCP_EVT_INIT_HANDLER(_tc, 0);					\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
     .format = "syn-rx: irs %u",						\
@@ -275,11 +276,14 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "syn-tx: iss %u",						\
-    .format_args = "i4",						\
+    .format = "syn-tx: iss %u snd_una %u snd_una_max %u snd_nxt %u",	\
+    .format_args = "i4i4i4i4",						\
   };									\
-  DECLARE_ETD(_tc, _e, 1);						\
+  DECLARE_ETD(_tc, _e, 4);						\
   ed->data[0] = _tc->iss;						\
+  ed->data[1] = _tc->snd_una - _tc->iss;					\
+  ed->data[2] = _tc->snd_una_max - _tc->iss;				\
+  ed->data[3] = _tc->snd_nxt - _tc->iss;					\
   TCP_EVT_STATE_CHANGE_HANDLER(_tc);					\
 }
 
@@ -287,24 +291,30 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "synack-tx: iss %u irs %u",				\
-    .format_args = "i4i4",						\
+    .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
+    .format_args = "i4i4i4i4i4",						\
   };									\
-  DECLARE_ETD(_tc, _e, 2);						\
+  DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->iss;						\
   ed->data[1] = _tc->irs;						\
+  ed->data[2] = _tc->snd_una - _tc->iss;					\
+  ed->data[3] = _tc->snd_nxt - _tc->iss;					\
+  ed->data[4] = _tc->rcv_nxt - _tc->irs;					\
 }
 
 #define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...)				\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "synack-rx: iss %u irs %u",				\
-    .format_args = "i4i4",						\
+    .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\
+    .format_args = "i4i4i4i4i4",						\
   };									\
-  DECLARE_ETD(_tc, _e, 2);						\
+  DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->iss;						\
   ed->data[1] = _tc->irs;						\
+  ed->data[2] = _tc->snd_una - _tc->iss;					\
+  ed->data[3] = _tc->snd_nxt - _tc->iss;					\
+  ed->data[4] = _tc->rcv_nxt - _tc->irs;					\
   TCP_EVT_STATE_CHANGE_HANDLER(_tc);					\
 }
 
@@ -361,17 +371,20 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "%s-rxt: iss %u",						\
-    .format_args = "t4i4",						\
+    .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u",		\
+    .format_args = "t4i4i4i4i4",						\
     .n_enum_strings = 2,						\
     .enum_strings = {                                           	\
 	"syn",	                                             		\
         "syn-ack",							\
     },  								\
   };									\
-  DECLARE_ETD(_tc, _e, 2);						\
+  DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _type;							\
   ed->data[1] = _tc->iss;						\
+  ed->data[2] = _tc->irs;						\
+  ed->data[3] = _tc->snd_nxt - _tc->iss;					\
+  ed->data[4] = _tc->rcv_nxt - _tc->irs;					\
 }
 
 #else
@@ -414,7 +427,7 @@
   ed->data[0] = _tc->rcv_nxt - _tc->irs;				\
   ed->data[1] = _tc->rcv_wnd;						\
   ed->data[2] = _tc->snd_nxt - _tc->iss;				\
-  ed->data[3] = tcp_available_wnd(_tc);					\
+  ed->data[3] = tcp_available_snd_wnd(_tc);				\
   ed->data[4] = _tc->snd_wnd;						\
 }
 
@@ -422,7 +435,7 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u",	\
+    .format = "ack-rx: %u snd_una %u snd_wnd %u cwnd %u inflight %u",	\
     .format_args = "i4i4i4i4i4",					\
   };									\
   DECLARE_ETD(_tc, _e, 5);						\
@@ -452,13 +465,13 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "pktize: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\
+    .format = "tx: una %u snd_nxt %u space %u flight %u rcv_wnd %u",\
     .format_args = "i4i4i4i4i4",					\
   };									\
   DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->snd_una - _tc->iss;				\
   ed->data[1] = _tc->snd_nxt - _tc->iss;				\
-  ed->data[2] = tcp_available_snd_space (_tc);				\
+  ed->data[2] = tcp_available_output_snd_space (_tc);			\
   ed->data[3] = tcp_flight_size (_tc);					\
   ed->data[4] = _tc->rcv_wnd;						\
 }
diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def
index a4e46d6..0892231 100644
--- a/src/vnet/tcp/tcp_error.def
+++ b/src/vnet/tcp/tcp_error.def
@@ -38,4 +38,5 @@
 tcp_error (RST_SENT, "Resets sent")
 tcp_error (INVALID_CONNECTION, "Invalid connection")
 tcp_error (NO_WND, "No window")
-tcp_error (CONNECTION_CLOSED, "Connection closed")
\ No newline at end of file
+tcp_error (CONNECTION_CLOSED, "Connection closed")
+tcp_error (CREATE_EXISTS, "Connection already exists")
\ No newline at end of file
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 1d90345..841e72a 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -275,6 +275,7 @@
 
   if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts)))
     {
+      clib_warning ("options parse error");
       return -1;
     }
 
@@ -350,9 +351,12 @@
   if (tcp_syn (th0))
     {
       /* TODO implement RFC 5961 */
-      tcp_make_ack (tc0, b0);
+      if (tc0->state != TCP_STATE_SYN_RCVD)
+	tcp_make_ack (tc0, b0);
+      else
+	tcp_make_synack (tc0, b0);
       *next0 = tcp_next_output (tc0->c_is_ip4);
-      TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0);
+      TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0);
       return -1;
     }
 
@@ -1842,6 +1846,74 @@
 vlib_node_registration_t tcp4_syn_sent_node;
 vlib_node_registration_t tcp6_syn_sent_node;
 
+static u8
+tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
+{
+  transport_connection_t *tmp;
+  if (!tc)
+    return 1;
+
+  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
+		 && (tc->state == TCP_STATE_LISTEN
+		     || tc->c_rmt_port == hdr->src_port));
+
+  if (!is_valid)
+    {
+      if ((tmp =
+	   stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip,
+					    tc->c_lcl_port, tc->c_rmt_port,
+					    tc->c_transport_proto)))
+	{
+	  if (tmp->lcl_port == hdr->dst_port
+	      && tmp->rmt_port == hdr->src_port)
+	    {
+	      clib_warning ("half-open is valid!");
+	    }
+	}
+    }
+  return is_valid;
+}
+
+/**
+ * Lookup transport connection
+ */
+static tcp_connection_t *
+tcp_lookup_connection (vlib_buffer_t * b, u8 thread_index, u8 is_ip4)
+{
+  tcp_header_t *tcp;
+  transport_connection_t *tconn;
+  tcp_connection_t *tc;
+  if (is_ip4)
+    {
+      ip4_header_t *ip4;
+      ip4 = vlib_buffer_get_current (b);
+      tcp = ip4_next_header (ip4);
+      tconn = stream_session_lookup_transport_wt4 (&ip4->dst_address,
+						   &ip4->src_address,
+						   tcp->dst_port,
+						   tcp->src_port,
+						   SESSION_TYPE_IP4_TCP,
+						   thread_index);
+      tc = tcp_get_connection_from_transport (tconn);
+      ASSERT (tcp_lookup_is_valid (tc, tcp));
+    }
+  else
+    {
+      ip6_header_t *ip6;
+      ip6 = vlib_buffer_get_current (b);
+      tcp = ip6_next_header (ip6);
+      tconn = stream_session_lookup_transport_wt6 (&ip6->dst_address,
+						   &ip6->src_address,
+						   tcp->dst_port,
+						   tcp->src_port,
+						   SESSION_TYPE_IP6_TCP,
+						   thread_index);
+      tc = tcp_get_connection_from_transport (tconn);
+      ASSERT (tcp_lookup_is_valid (tc, tcp));
+    }
+  return tc;
+}
+
 always_inline uword
 tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 		       vlib_frame_t * from_frame, int is_ip4)
@@ -1888,6 +1960,15 @@
 	      goto drop;
 	    }
 
+	  /* Half-open completed recently but the connection wasn't removed
+	   * yet by the owning thread */
+	  if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
+	    {
+	      /* Make sure the connection actually exists */
+	      ASSERT (tcp_lookup_connection (b0, my_thread_index, is_ip4));
+	      goto drop;
+	    }
+
 	  ack0 = vnet_buffer (b0)->tcp.ack_number;
 	  seq0 = vnet_buffer (b0)->tcp.seq_number;
 	  tcp0 = tcp_buffer_hdr (b0);
@@ -1914,16 +1995,20 @@
 	   */
 	  if (tcp_ack (tcp0))
 	    {
-	      if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
+	      if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
 		{
+		  clib_warning ("ack not in rcv wnd");
 		  if (!tcp_rst (tcp0))
 		    tcp_send_reset_w_pkt (tc0, b0, is_ip4);
 		  goto drop;
 		}
 
 	      /* Make sure ACK is valid */
-	      if (tc0->snd_una > ack0)
-		goto drop;
+	      if (seq_gt (tc0->snd_una, ack0))
+		{
+		  clib_warning ("ack invalid");
+		  goto drop;
+		}
 	    }
 
 	  /*
@@ -1949,11 +2034,17 @@
 
 	  /* No SYN flag. Drop. */
 	  if (!tcp_syn (tcp0))
-	    goto drop;
+	    {
+	      clib_warning ("not synack");
+	      goto drop;
+	    }
 
 	  /* Parse options */
 	  if (tcp_options_parse (tcp0, &tc0->rcv_opts))
-	    goto drop;
+	    {
+	      clib_warning ("options parse fail");
+	      goto drop;
+	    }
 
 	  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
 	   * current thread pool. */
@@ -1981,8 +2072,8 @@
 	  if (tcp_opts_wscale (&new_tc0->rcv_opts))
 	    new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
 
-	  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
-	    << new_tc0->snd_wscale;
+	  /* RFC1323: SYN and SYN-ACK wnd not scaled */
+	  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
 	  new_tc0->snd_wl1 = seq0;
 	  new_tc0->snd_wl2 = ack0;
 
@@ -2004,6 +2095,7 @@
 	       * allocate session send reset */
 	      if (stream_session_connect_notify (&new_tc0->connection, 0))
 		{
+		  clib_warning ("connect notify fail");
 		  tcp_send_reset_w_pkt (new_tc0, b0, is_ip4);
 		  tcp_connection_cleanup (new_tc0);
 		  goto drop;
@@ -2032,6 +2124,7 @@
 		}
 
 	      tc0->rtt_ts = 0;
+	      tcp_init_snd_vars (tc0);
 	      tcp_make_synack (new_tc0, b0);
 	      next0 = tcp_next_output (is_ip4);
 
@@ -2196,6 +2289,18 @@
 	    + tcp_is_syn (tcp0) + tcp_is_fin (tcp0)
 	    + vnet_buffer (b0)->tcp.data_len;
 
+	  if (CLIB_DEBUG)
+	    {
+	      tcp_connection_t *tmp;
+	      tmp = tcp_lookup_connection (b0, my_thread_index, is_ip4);
+	      if (tmp->state != tc0->state)
+		{
+		  clib_warning ("state changed");
+		  ASSERT (0);
+		  goto drop;
+		}
+	    }
+
 	  /*
 	   * Special treatment for CLOSED
 	   */
@@ -2211,8 +2316,8 @@
 	   */
 
 	  /* 1-4: check SEQ, RST, SYN */
-	  if (PREDICT_FALSE
-	      (tcp_segment_validate (vm, tc0, b0, tcp0, &next0)))
+	  if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, tcp0,
+						   &next0)))
 	    {
 	      error0 = TCP_ERROR_SEGMENT_INVALID;
 	      goto drop;
@@ -2230,6 +2335,7 @@
 	       */
 	      if (!tcp_rcv_ack_is_acceptable (tc0, b0))
 		{
+		  clib_warning ("connection not accepted");
 		  tcp_send_reset_w_pkt (tc0, b0, is_ip4);
 		  goto drop;
 		}
@@ -2252,6 +2358,7 @@
 	      /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
 	      tcp_retransmit_timer_reset (tc0);
 	      tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
+	      TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0);
 	      break;
 	    case TCP_STATE_ESTABLISHED:
 	      /* We can get packets in established state here because they
@@ -2400,6 +2507,7 @@
 	      /* Send FIN-ACK notify app and enter CLOSE-WAIT */
 	      tcp_connection_timers_reset (tc0);
 	      tcp_make_fin (tc0, b0);
+	      tc0->snd_nxt += 1;
 	      next0 = tcp_next_output (tc0->c_is_ip4);
 	      stream_session_disconnect_notify (&tc0->connection);
 	      tc0->state = TCP_STATE_CLOSE_WAIT;
@@ -2598,6 +2706,14 @@
 
 	  /* 3. check for a SYN (did that already) */
 
+	  /* Make sure connection wasn't just created */
+	  child0 = tcp_lookup_connection (b0, my_thread_index, is_ip4);
+	  if (PREDICT_FALSE (child0->state != TCP_STATE_LISTEN))
+	    {
+	      error0 = TCP_ERROR_CREATE_EXISTS;
+	      goto drop;
+	    }
+
 	  /* Create child session and send SYN-ACK */
 	  child0 = tcp_connection_new (my_thread_index);
 	  child0->c_lcl_port = lc0->c_lcl_port;
@@ -2621,12 +2737,15 @@
 	  if (stream_session_accept (&child0->connection, lc0->c_s_index, sst,
 				     0 /* notify */ ))
 	    {
+	      clib_warning ("session accept fail");
+	      tcp_connection_cleanup (child0);
 	      error0 = TCP_ERROR_CREATE_SESSION_FAIL;
 	      goto drop;
 	    }
 
 	  if (tcp_options_parse (th0, &child0->rcv_opts))
 	    {
+	      clib_warning ("options parse fail");
 	      goto drop;
 	    }
 
@@ -2651,7 +2770,7 @@
 	  child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
 
 	  tcp_connection_init_vars (child0);
-	  TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0);
+	  TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0, 1);
 
 	  /* Reuse buffer to make syn-ack and send */
 	  tcp_make_synack (child0, b0);
@@ -2768,34 +2887,6 @@
 
 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
 
-static u8
-tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr)
-{
-  transport_connection_t *tmp;
-  if (!tc)
-    return 1;
-
-  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
-		 && (tc->state == TCP_STATE_LISTEN
-		     || tc->c_rmt_port == hdr->src_port));
-
-  if (!is_valid)
-    {
-      if ((tmp =
-	   stream_session_half_open_lookup (&tc->c_lcl_ip, &tc->c_rmt_ip,
-					    tc->c_lcl_port, tc->c_rmt_port,
-					    tc->c_transport_proto)))
-	{
-	  if (tmp->lcl_port == hdr->dst_port
-	      && tmp->rmt_port == hdr->src_port)
-	    {
-	      clib_warning ("half-open is valid!");
-	    }
-	}
-    }
-  return is_valid;
-}
-
 always_inline uword
 tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 		    vlib_frame_t * from_frame, int is_ip4)
@@ -2822,6 +2913,7 @@
 	  vlib_buffer_t *b0;
 	  tcp_header_t *tcp0 = 0;
 	  tcp_connection_t *tc0;
+	  transport_connection_t *tconn;
 	  ip4_header_t *ip40;
 	  ip6_header_t *ip60;
 	  u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP;
@@ -2847,15 +2939,13 @@
 				  + tcp_header_bytes (tcp0));
 	      n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
 		- n_advance_bytes0;
-
-	      tc0 =
-		(tcp_connection_t *)
-		stream_session_lookup_transport_wt4 (&ip40->dst_address,
-						     &ip40->src_address,
-						     tcp0->dst_port,
-						     tcp0->src_port,
-						     SESSION_TYPE_IP4_TCP,
-						     my_thread_index);
+	      tconn = stream_session_lookup_transport_wt4 (&ip40->dst_address,
+							   &ip40->src_address,
+							   tcp0->dst_port,
+							   tcp0->src_port,
+							   SESSION_TYPE_IP4_TCP,
+							   my_thread_index);
+	      tc0 = tcp_get_connection_from_transport (tconn);
 	      ASSERT (tcp_lookup_is_valid (tc0, tcp0));
 	    }
 	  else
@@ -2866,15 +2956,13 @@
 	      n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length)
 		- n_advance_bytes0;
 	      n_advance_bytes0 += sizeof (ip60[0]);
-
-	      tc0 =
-		(tcp_connection_t *)
-		stream_session_lookup_transport_wt6 (&ip60->dst_address,
-						     &ip60->src_address,
-						     tcp0->dst_port,
-						     tcp0->src_port,
-						     SESSION_TYPE_IP6_TCP,
-						     my_thread_index);
+	      tconn = stream_session_lookup_transport_wt6 (&ip60->dst_address,
+							   &ip60->src_address,
+							   tcp0->dst_port,
+							   tcp0->src_port,
+							   SESSION_TYPE_IP6_TCP,
+							   my_thread_index);
+	      tc0 = tcp_get_connection_from_transport (tconn);
 	      ASSERT (tcp_lookup_is_valid (tc0, tcp0));
 	    }
 
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 15a9dcb..9cb3e77 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -66,11 +66,10 @@
 }
 
 static u8
-tcp_window_compute_scale (u32 available_space)
+tcp_window_compute_scale (u32 window)
 {
   u8 wnd_scale = 0;
-  while (wnd_scale < TCP_MAX_WND_SCALE
-	 && (available_space >> wnd_scale) > TCP_WND_MAX)
+  while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX)
     wnd_scale++;
   return wnd_scale;
 }
@@ -444,12 +443,10 @@
 
   vec_validate (tm->tx_buffers[thread_index],
 		current_length + n_free_buffers - 1);
-  _vec_len (tm->tx_buffers[thread_index]) =
-    current_length + vlib_buffer_alloc_from_free_list (vlib_get_main (),
-						       tm->tx_buffers
-						       [thread_index],
-						       n_free_buffers,
-						       VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+  _vec_len (tm->tx_buffers[thread_index]) = current_length
+    + vlib_buffer_alloc (vlib_get_main (),
+			 &tm->tx_buffers[thread_index][current_length],
+			 n_free_buffers);
   /* buffer shortage, report failure */
   if (vec_len (tm->tx_buffers[thread_index]) == 0)
     {
@@ -470,7 +467,7 @@
 	return -1;
     }
   my_tx_buffers = tm->tx_buffers[thread_index];
-  *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1];
+  *bidx = my_tx_buffers[vec_len (my_tx_buffers) - 1];
   _vec_len (my_tx_buffers) -= 1;
   return 0;
 }
@@ -478,10 +475,7 @@
 always_inline void
 tcp_return_buffer (tcp_main_t * tm)
 {
-  u32 *my_tx_buffers;
-  u32 thread_index = vlib_get_thread_index ();
-  my_tx_buffers = tm->tx_buffers[thread_index];
-  _vec_len (my_tx_buffers) += 1;
+  _vec_len (tm->tx_buffers[vlib_get_thread_index ()]) += 1;
 }
 
 always_inline void *
@@ -489,7 +483,8 @@
 {
   if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
     vlib_buffer_free_one (vm, b->next_buffer);
-  b->flags = 0;
+  /* Zero all flags but free list index and trace flag */
+  b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1;
   b->current_data = 0;
   b->current_length = 0;
   b->total_length_not_including_first_buffer = 0;
@@ -503,7 +498,8 @@
 tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b)
 {
   ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
-  b->flags = VNET_BUFFER_F_LOCALLY_ORIGINATED;
+  b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK;
+  b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
   b->total_length_not_including_first_buffer = 0;
   vnet_buffer (b)->tcp.flags = 0;
 
@@ -567,8 +563,34 @@
 
   /* Reset flags, make sure ack is sent */
   vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
+}
 
-  tc->snd_nxt += 1;
+/**
+ * Convert buffer to SYN
+ */
+void
+tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+  u8 tcp_hdr_opts_len, tcp_opts_len;
+  tcp_header_t *th;
+  u16 initial_wnd;
+  tcp_options_t snd_opts;
+
+  initial_wnd = tcp_initial_window_to_advertise (tc);
+
+  /* Make and write options */
+  memset (&snd_opts, 0, sizeof (snd_opts));
+  tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
+  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
+
+  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
+			     tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
+			     initial_wnd);
+  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
+  tcp_options_write ((u8 *) (th + 1), &snd_opts);
+
+  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
+		    tc->rto * TCP_TO_TIMER_TICK);
 }
 
 /**
@@ -582,37 +604,25 @@
   u8 tcp_opts_len, tcp_hdr_opts_len;
   tcp_header_t *th;
   u16 initial_wnd;
-  u32 time_now;
 
   memset (snd_opts, 0, sizeof (*snd_opts));
-
   tcp_reuse_buffer (vm, b);
 
-  /* Set random initial sequence */
-  time_now = tcp_time_now ();
-
-  tc->iss = random_u32 (&time_now);
-  tc->snd_una = tc->iss;
-  tc->snd_nxt = tc->iss + 1;
-  tc->snd_una_max = tc->snd_nxt;
-
   initial_wnd = tcp_initial_window_to_advertise (tc);
-
-  /* Make and write options */
   tcp_opts_len = tcp_make_synack_options (tc, snd_opts);
   tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
 
   th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
 			     tc->rcv_nxt, tcp_hdr_opts_len,
 			     TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd);
-
   tcp_options_write ((u8 *) (th + 1), snd_opts);
 
   vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
   vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
 
-  /* Init retransmit timer */
-  tcp_retransmit_timer_set (tc);
+  /* Init retransmit timer. Use update instead of set because of
+   * retransmissions */
+  tcp_retransmit_timer_force_update (tc);
   TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
 }
 
@@ -918,44 +928,17 @@
   u32 bi;
   tcp_main_t *tm = vnet_get_tcp_main ();
   vlib_main_t *vm = vlib_get_main ();
-  u8 tcp_hdr_opts_len, tcp_opts_len;
-  tcp_header_t *th;
-  u32 time_now;
-  u16 initial_wnd;
-  tcp_options_t snd_opts;
 
   if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
     return;
 
   b = vlib_get_buffer (vm, bi);
   tcp_init_buffer (vm, b);
-
-  /* Set random initial sequence */
-  time_now = tcp_time_now ();
-
-  tc->iss = random_u32 (&time_now);
-  tc->snd_una = tc->iss;
-  tc->snd_una_max = tc->snd_nxt = tc->iss + 1;
-
-  initial_wnd = tcp_initial_window_to_advertise (tc);
-
-  /* Make and write options */
-  memset (&snd_opts, 0, sizeof (snd_opts));
-  tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
-  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
-
-  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
-			     tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
-			     initial_wnd);
-
-  tcp_options_write ((u8 *) (th + 1), &snd_opts);
+  tcp_make_syn (tc, b);
 
   /* Measure RTT with this */
   tc->rtt_ts = tcp_time_now ();
   tc->rtt_seq = tc->snd_nxt;
-
-  /* Start retransmit trimer  */
-  tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK);
   tc->rto_boff = 0;
 
   /* Set the connection establishment timer */
@@ -1010,8 +993,12 @@
   /* buffer will be initialized by in tcp_make_fin */
   tcp_make_fin (tc, b);
   tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4);
-  tc->flags |= TCP_CONN_FINSNT;
-  tc->flags &= ~TCP_CONN_FINPNDG;
+  if (!(tc->flags & TCP_CONN_FINSNT))
+    {
+      tc->flags |= TCP_CONN_FINSNT;
+      tc->flags &= ~TCP_CONN_FINPNDG;
+      tc->snd_nxt += 1;
+    }
   tcp_retransmit_timer_force_update (tc);
   TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
 }
@@ -1146,6 +1133,7 @@
    * Make sure we can retransmit something
    */
   available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection);
+  ASSERT (available_bytes >= offset);
   available_bytes -= offset;
   if (!available_bytes)
     return 0;
@@ -1209,6 +1197,7 @@
 				    VLIB_FRAME_SIZE - available_bufs))
 	    {
 	      tcp_return_buffer (tm);
+	      *b = 0;
 	      return 0;
 	    }
 	}
@@ -1236,7 +1225,7 @@
 	  ASSERT (n_peeked == len_to_deq);
 	  n_bytes += n_peeked;
 	  chain_b->current_length = n_peeked;
-	  chain_b->flags = 0;
+	  chain_b->flags &= VLIB_BUFFER_FREE_LIST_INDEX_MASK;
 	  chain_b->next_buffer = 0;
 
 	  /* update previous buffer */
@@ -1310,19 +1299,6 @@
       tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
     }
 
-  if (!tcp_in_recovery (tc) && tc->rto_boff > 0
-      && tc->state >= TCP_STATE_ESTABLISHED)
-    {
-      tc->rto_boff = 0;
-      tcp_update_rto (tc);
-    }
-
-  /* Increment RTO backoff (also equal to number of retries) */
-  tc->rto_boff += 1;
-
-  /* Go back to first un-acked byte */
-  tc->snd_nxt = tc->snd_una;
-
   if (tc->state >= TCP_STATE_ESTABLISHED)
     {
       /* Lost FIN, retransmit and return */
@@ -1332,6 +1308,18 @@
 	  return;
 	}
 
+      /* We're not in recovery so make sure rto_boff is 0 */
+      if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+	{
+	  tc->rto_boff = 0;
+	  tcp_update_rto (tc);
+	}
+
+      /* Increment RTO backoff (also equal to number of retries) and go back
+       * to first un-acked byte  */
+      tc->rto_boff += 1;
+      tc->snd_nxt = tc->snd_una;
+
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
 	tcp_rtx_timeout_cc (tc);
@@ -1349,12 +1337,11 @@
 
       if (n_bytes == 0)
 	{
-	  if (b)
-	    {
-	      clib_warning ("retransmit fail: %U", format_tcp_connection, tc,
-			    2);
-	      ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion);
-	    }
+	  ASSERT (!b);
+	  if (tc->snd_una == tc->snd_una_max)
+	    return;
+	  ASSERT (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion);
+	  clib_warning ("retransmit fail: %U", format_tcp_connection, tc, 2);
 	  /* Try again eventually */
 	  tcp_retransmit_timer_set (tc);
 	  return;
@@ -1365,16 +1352,18 @@
       /* For first retransmit, record timestamp (Eifel detection RFC3522) */
       if (tc->rto_boff == 1)
 	tc->snd_rxt_ts = tcp_time_now ();
+
+      tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+      tcp_retransmit_timer_update (tc);
     }
-  /* Retransmit for SYN/SYNACK */
-  else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT)
+  /* Retransmit for SYN */
+  else if (tc->state == TCP_STATE_SYN_SENT)
     {
       /* Half-open connection actually moved to established but we were
        * waiting for syn retransmit to pop to call cleanup from the right
        * thread. */
       if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
 	{
-	  ASSERT (tc->state == TCP_STATE_SYN_SENT);
 	  if (tcp_half_open_connection_cleanup (tc))
 	    {
 	      clib_warning ("could not remove half-open connection");
@@ -1385,23 +1374,40 @@
 
       /* Try without increasing RTO a number of times. If this fails,
        * start growing RTO exponentially */
+      tc->rto_boff += 1;
       if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
 	tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
 
       if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
-	{
-	  clib_warning ("tcp_get_free_buffer_index FAIL");
-	  return;
-	}
+	return;
+
       b = vlib_get_buffer (vm, bi);
       tcp_init_buffer (vm, b);
-      tcp_push_hdr_i (tc, b, tc->state, 1);
+      tcp_make_syn (tc, b);
 
-      /* Account for the SYN */
-      tc->snd_nxt += 1;
       tc->rtt_ts = 0;
-      TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc,
-		   (tc->state == TCP_STATE_SYN_SENT ? 0 : 1));
+      TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0);
+
+      /* This goes straight to ipx_lookup. Retransmit timer set already */
+      tcp_push_ip_hdr (tm, tc, b);
+      tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
+    }
+  /* Retransmit SYN-ACK */
+  else if (tc->state == TCP_STATE_SYN_RCVD)
+    {
+      tc->rto_boff += 1;
+      tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+      tc->rtt_ts = 0;
+
+      if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi)))
+	return;
+
+      b = vlib_get_buffer (vm, bi);
+      tcp_make_synack (tc, b);
+      TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1);
+
+      /* Retransmit timer already updated, just enqueue to output */
+      tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
     }
   else
     {
@@ -1409,26 +1415,6 @@
       clib_warning ("connection closed ...");
       return;
     }
-
-  if (!is_syn)
-    {
-      tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
-
-      /* Re-enable retransmit timer */
-      tcp_retransmit_timer_set (tc);
-    }
-  else
-    {
-      ASSERT (tc->state == TCP_STATE_SYN_SENT);
-
-      /* This goes straight to ipx_lookup */
-      tcp_push_ip_hdr (tm, tc, b);
-      tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
-
-      /* Re-enable retransmit timer */
-      tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN,
-		     tc->rto * TCP_TO_TIMER_TICK);
-    }
 }
 
 void