tcp: improve rcv process ack processing

- Avoid doing cc in closing states.
- Reset connections closed with unread data

Change-Id: I97d46b0459f03ea5439eeb0f233b6c17d3e06dfd
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index a3b84a6..ed42e54 100644
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h
@@ -389,6 +389,13 @@
 }
 
 always_inline u32
+transport_max_rx_dequeue (transport_connection_t * tc)
+{
+  session_t *s = session_get (tc->s_index, tc->thread_index);
+  return svm_fifo_max_dequeue (s->rx_fifo);
+}
+
+always_inline u32
 transport_rx_fifo_size (transport_connection_t * tc)
 {
   session_t *s = session_get (tc->s_index, tc->thread_index);
diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c
index abab086..d83ecfb 100644
--- a/src/vnet/session/transport.c
+++ b/src/vnet/session/transport.c
@@ -49,7 +49,7 @@
 
 #define TRANSPORT_PACER_MIN_MSS 	1460
 #define TRANSPORT_PACER_MIN_BURST 	TRANSPORT_PACER_MIN_MSS
-#define TRANSPORT_PACER_MAX_BURST	(48 * TRANSPORT_PACER_MIN_MSS)
+#define TRANSPORT_PACER_MAX_BURST	(32 * TRANSPORT_PACER_MIN_MSS)
 
 u8 *
 format_transport_proto (u8 * s, va_list * args)
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index e262513..09c47d9 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -350,6 +350,15 @@
       tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_FINWAIT1_TIME);
       break;
     case TCP_STATE_ESTABLISHED:
+      /* If closing with unread data, reset the connection */
+      if (transport_max_rx_dequeue (&tc->connection))
+	{
+	  tcp_send_reset (tc);
+	  tcp_connection_timers_reset (tc);
+	  tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+	  tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
+	  break;
+	}
       if (!transport_max_tx_dequeue (&tc->connection))
 	tcp_send_fin (tc);
       else
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 8383d01..46d72b7 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -126,6 +126,7 @@
   _(FRXT_FIRST, "Fast-retransmit first again")	\
   _(DEQ_PENDING, "Pending dequeue acked")	\
   _(PSH_PENDING, "PSH pending")			\
+  _(FINRCVD, "FIN received")			\
 
 typedef enum _tcp_connection_flag_bits
 {
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index cc630f8..9ac2d85 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -405,11 +405,39 @@
 }
 
+/**
+ * Validate a segment's ACK and advance snd_una without running
+ * congestion control.
+ *
+ * @param tc     connection whose snd_una/snd_nxt/bytes_acked are updated
+ * @param b      buffer carrying the parsed ack number
+ * @param error  set to TCP_ERROR_ACK_OK or TCP_ERROR_ACK_INVALID
+ * @return 0 if the ACK was accepted, -1 otherwise
+ */
 always_inline int
-tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0)
+tcp_rcv_ack_no_cc (tcp_connection_t * tc, vlib_buffer_t * b, u32 * error)
 {
   /* SND.UNA =< SEG.ACK =< SND.NXT */
-  return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
-	  && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_una_max));
+  if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number)
+	&& seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
+    {
+      /* Only an ACK ahead of snd_nxt may pull snd_nxt forward. An old
+       * ACK (SEG.ACK < SND.UNA) also satisfies the snd_una_max bound and
+       * would otherwise rewind snd_una/snd_nxt and wrap bytes_acked. */
+      if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)
+	  && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max))
+	{
+	  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+	  goto acceptable;
+	}
+      *error = TCP_ERROR_ACK_INVALID;
+      return -1;
+    }
+
+acceptable:
+  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
+  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
+  *error = TCP_ERROR_ACK_OK;
+  return 0;
 }
 
 /**
@@ -2703,24 +2718,24 @@
       switch (tc0->state)
 	{
 	case TCP_STATE_SYN_RCVD:
+
+	  /* Make sure the segment is exactly right */
+	  if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
+	    {
+	      tcp_connection_reset (tc0);
+	      error0 = TCP_ERROR_SEGMENT_INVALID;
+	      goto drop;
+	    }
+
 	  /*
 	   * If the segment acknowledgment is not acceptable, form a
 	   * reset segment,
 	   *  <SEQ=SEG.ACK><CTL=RST>
 	   * and send it.
 	   */
-	  if (!tcp_rcv_ack_is_acceptable (tc0, b0))
+	  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
 	    {
 	      tcp_connection_reset (tc0);
-	      error0 = TCP_ERROR_ACK_INVALID;
-	      goto drop;
-	    }
-
-	  /* Make sure the ack is exactly right */
-	  if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
-	    {
-	      tcp_connection_reset (tc0);
-	      error0 = TCP_ERROR_SEGMENT_INVALID;
 	      goto drop;
 	    }
 
@@ -2774,12 +2789,22 @@
 	  /* If FIN is ACKed */
 	  else if (tc0->snd_una == tc0->snd_nxt)
 	    {
-	      tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
-
 	      /* Stop all retransmit timers because we have nothing more
-	       * to send. Enable waitclose though because we're willing to
-	       * wait for peer's FIN but not indefinitely. */
+	       * to send. */
 	      tcp_connection_timers_reset (tc0);
+
+	      /* We already have a FIN but didn't transition to CLOSING
+	       * because of outstanding tx data. Close the connection. */
+	      if (tc0->flags & TCP_CONN_FINRCVD)
+		{
+		  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
+		  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
+		  goto drop;
+		}
+
+	      tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
+	      /* Enable waitclose because we're willing to wait for peer's
+	       * FIN but not indefinitely. */
 	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
 
 	      /* Don't try to deq the FIN acked */
@@ -2793,7 +2818,7 @@
 	  /* In addition to the processing for the ESTABLISHED state, if
 	   * the retransmission queue is empty, the user's CLOSE can be
 	   * acknowledged ("ok") but do not delete the TCB. */
-	  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
+	  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
 	    goto drop;
 	  tc0->burst_acked = 0;
 	  break;
@@ -2802,37 +2827,27 @@
 	  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
 	    goto drop;
 
-	  if (tc0->flags & TCP_CONN_FINPNDG)
-	    {
-	      /* TX fifo finally drained */
-	      if (!transport_max_tx_dequeue (&tc0->connection))
-		{
-		  tcp_send_fin (tc0);
-		  tcp_connection_timers_reset (tc0);
-		  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
-		  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
-		}
-	    }
+	  if (!(tc0->flags & TCP_CONN_FINPNDG))
+	    break;
+
+	  /* Still have outstanding tx data */
+	  if (transport_max_tx_dequeue (&tc0->connection))
+	    break;
+
+	  tcp_send_fin (tc0);
+	  tcp_connection_timers_reset (tc0);
+	  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
+	  tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
 	  break;
 	case TCP_STATE_CLOSING:
 	  /* In addition to the processing for the ESTABLISHED state, if
 	   * the ACK acknowledges our FIN then enter the TIME-WAIT state,
 	   * otherwise ignore the segment. */
-	  if (!tcp_rcv_ack_is_acceptable (tc0, b0))
-	    {
-	      error0 = TCP_ERROR_ACK_INVALID;
-	      goto drop;
-	    }
+	  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
+	    goto drop;
 
-	  error0 = TCP_ERROR_ACK_OK;
-	  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
-	  /* Ack moved snd_una beyond snd_nxt so reprogram fin */
-	  if (seq_gt (tc0->snd_una, tc0->snd_nxt))
-	    {
-	      tc0->snd_nxt = tc0->snd_una;
-	      tc0->flags &= ~TCP_CONN_FINSNT;
-	      goto drop;
-	    }
+	  if (tc0->snd_una != tc0->snd_nxt)
+	    goto drop;
 
 	  tcp_connection_timers_reset (tc0);
 	  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
@@ -2845,13 +2860,9 @@
 	   * acknowledgment of our FIN. If our FIN is now acknowledged,
 	   * delete the TCB, enter the CLOSED state, and return. */
 
-	  if (!tcp_rcv_ack_is_acceptable (tc0, b0))
-	    {
-	      error0 = TCP_ERROR_ACK_INVALID;
-	      goto drop;
-	    }
-	  error0 = TCP_ERROR_ACK_OK;
-	  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
+	  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
+	    goto drop;
+
 	  /* Apparently our ACK for the peer's FIN was lost */
 	  if (is_fin0 && tc0->snd_una != tc0->snd_nxt)
 	    {
@@ -2875,7 +2886,7 @@
 	   * retransmission of the remote FIN. Acknowledge it, and restart
 	   * the 2 MSL timeout. */
 
-	  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
+	  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
 	    goto drop;
 
 	  if (!is_fin0)
@@ -2943,26 +2954,17 @@
 	  break;
 	case TCP_STATE_FIN_WAIT_1:
 	  tc0->rcv_nxt += 1;
-	  tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
+	  /* If data is outstanding stay in FIN_WAIT_1 and try to finish
+	   * sending it. */
 	  if (tc0->flags & TCP_CONN_FINPNDG)
 	    {
-	      /* Drop all outstanding tx data. */
-	      session_tx_fifo_dequeue_drop (&tc0->connection,
-					    transport_max_tx_dequeue
-					    (&tc0->connection));
-	      /* Make it look as if we've recovered, if needed */
-	      if (tcp_in_cong_recovery (tc0))
-		{
-		  scoreboard_clear (&tc0->sack_sb);
-		  tcp_fastrecovery_off (tc0);
-		  tcp_recovery_off (tc0);
-		  tcp_connection_timers_reset (tc0);
-		  tc0->snd_nxt = tc0->snd_una;
-		}
-	      tcp_send_fin (tc0);
+	      tc0->flags |= TCP_CONN_FINRCVD;
 	    }
 	  else
-	    tcp_program_ack (wrk, tc0);
+	    {
+	      tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
+	      tcp_program_ack (wrk, tc0);
+	    }
 	  /* Wait for ACK for our FIN but not forever */
 	  tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
 	  break;
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 518a80d..03caa07 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1132,7 +1132,6 @@
   tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
 		  /* update_snd_nxt */ 1);
   tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
-  ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd));
   tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
   /* If not tracking an ACK, start tracking */
   if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))