TCP cc/window management fixes and debugging

- added persist timer
- update rcv_las whenever sending an ack
- moved fifo size to its own cache line
- improved session and builtin client debugging

Change-Id: Ia649cf942cf0c061a713e8b67f0eb6974a6cd55b
Signed-off-by: Florin Coras <fcoras@cisco.com>
Signed-off-by: Dave Barach <dave@barachs.net>
diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c
index 07b0d2d..cc84feb 100644
--- a/src/svm/svm_fifo.c
+++ b/src/svm/svm_fifo.c
@@ -254,6 +254,10 @@
 {
   ooo_segment_t *s;
   u32 index, bytes = 0, diff;
+  u32 cursize;
+
+  /* read cursize, which can only decrease while we're working */
+  cursize = svm_fifo_max_dequeue (f);
 
   s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head);
 
@@ -286,8 +290,8 @@
   /* If tail is adjacent to an ooo segment, 'consume' it */
   if (diff == 0)
     {
-      bytes = ((f->nitems - f->cursize) >= s->length) ? s->length :
-	f->nitems - f->cursize;
+      bytes = ((f->nitems - cursize) >= s->length) ? s->length :
+	f->nitems - cursize;
 
       f->tail += bytes;
       f->tail %= f->nitems;
@@ -305,11 +309,12 @@
   u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
   u32 cursize, nitems;
 
-  if (PREDICT_FALSE (f->cursize == f->nitems))
+  /* read cursize, which can only decrease while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+
+  if (PREDICT_FALSE (cursize == f->nitems))
     return -2;			/* fifo stuffed */
 
-  /* read cursize, which can only decrease while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
 
   /* Number of bytes we're going to copy */
@@ -382,8 +387,8 @@
 
   ASSERT (offset > 0);
 
-  /* read cursize, which can only decrease while we're working */
-  cursize = f->cursize;
+  /* read cursize, which can only decrease while we're working */
+  cursize = svm_fifo_max_dequeue (f);
   nitems = f->nitems;
 
   /* Will this request fit? */
@@ -437,11 +442,11 @@
   u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
   u32 cursize, nitems;
 
-  if (PREDICT_FALSE (f->cursize == 0))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+  if (PREDICT_FALSE (cursize == 0))
     return -2;			/* nothing in the fifo */
 
-  /* read cursize, which can only increase while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
 
   /* Number of bytes we're going to copy */
@@ -495,11 +500,11 @@
   u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
   u32 cursize, nitems, real_head;
 
-  if (PREDICT_FALSE (f->cursize == 0))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+  if (PREDICT_FALSE (cursize == 0))
     return -2;			/* nothing in the fifo */
 
-  /* read cursize, which can only increase while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
   real_head = f->head + offset;
   real_head = real_head >= nitems ? real_head - nitems : real_head;
@@ -532,11 +537,11 @@
   u32 total_drop_bytes, first_drop_bytes, second_drop_bytes;
   u32 cursize, nitems;
 
-  if (PREDICT_FALSE (f->cursize == 0))
+  /* read cursize, which can only increase while we're working */
+  cursize = svm_fifo_max_dequeue (f);
+  if (PREDICT_FALSE (cursize == 0))
     return -2;			/* nothing in the fifo */
 
-  /* read cursize, which can only increase while we're working */
-  cursize = f->cursize;
   nitems = f->nitems;
 
   /* Number of bytes we're going to drop */
diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h
index 3955617..80e5b0f 100644
--- a/src/svm/svm_fifo.h
+++ b/src/svm/svm_fifo.h
@@ -44,14 +44,16 @@
 
 typedef struct
 {
+  volatile u32 cursize;		/**< current fifo size */
+  u32 nitems;
+    CLIB_CACHE_LINE_ALIGN_MARK (end_cursize);
+
   pthread_mutex_t mutex;	/* 8 bytes */
   pthread_cond_t condvar;	/* 8 bytes */
   svm_lock_tag_t tag;
 
-  volatile u32 cursize;		/**< current fifo size */
   volatile u8 has_event;	/**< non-zero if deq event exists */
   u32 owner_pid;
-  u32 nitems;
 
   /* Backpointers */
   u32 server_session_index;
@@ -105,7 +107,7 @@
 static inline u32
 svm_fifo_max_enqueue (svm_fifo_t * f)
 {
-  return f->nitems - f->cursize;
+  return f->nitems - svm_fifo_max_dequeue (f);
 }
 
 static inline u8
diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c
index 8681105..b86e87d 100644
--- a/src/vnet/session/node.c
+++ b/src/vnet/session/node.c
@@ -119,15 +119,20 @@
 
   /* Nothing to read return */
   if (max_dequeue0 == 0)
-    {
-      return 0;
-    }
+    return 0;
 
   /* Ensure we're not writing more than transport window allows */
-  max_len_to_snd0 = clib_min (max_dequeue0, snd_space0);
-
-  /* TODO check if transport is willing to send len_to_snd0
-   * bytes (Nagle) */
+  if (max_dequeue0 < snd_space0)
+    {
+      /* Constrained by tx queue. Try to send only fully formed segments */
+      max_len_to_snd0 = (max_dequeue0 > snd_mss0) ?
+	max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0;
+      /* TODO Nagle ? */
+    }
+  else
+    {
+      max_len_to_snd0 = snd_space0;
+    }
 
   n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE;
   n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes);
@@ -308,11 +313,14 @@
   int n_tx_packets = 0;
   u32 my_thread_index = vm->cpu_index;
   int i, rv;
+  f64 now = vlib_time_now (vm);
+
+  SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index);
 
   /*
    *  Update TCP time
    */
-  tcp_update_time (vlib_time_now (vm), my_thread_index);
+  tcp_update_time (now, my_thread_index);
 
   /*
    * Get vpp queue events
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index f10918a..8e2b261 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -556,7 +556,7 @@
 					u8 * added_a_segment)
 {
   svm_fifo_segment_private_t *fifo_segment;
-  u32 fifo_size, default_fifo_size = 128 << 10;	/* TODO config */
+  u32 fifo_size, default_fifo_size = 1 << 16;	/* TODO config */
   int i;
 
   *added_a_segment = 0;
@@ -1293,6 +1293,10 @@
   vec_validate (smm->current_enqueue_epoch, num_threads - 1);
   vec_validate (smm->vpp_event_queues, num_threads - 1);
 
+#if SESSION_DBG
+  vec_validate (smm->last_event_poll_by_thread, num_threads - 1);
+#endif
+
   /* $$$$ preallocate hack config parameter */
   for (i = 0; i < 200000; i++)
     {
diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index a39bc06..6878b4d 100644
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h
@@ -20,6 +20,7 @@
 #include <vlibmemory/api.h>
 #include <vppinfra/sparse_vec.h>
 #include <svm/svm_fifo_segment.h>
+#include <vnet/session/session_debug.h>
 
 #define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0)
 #define INVALID_INDEX ((u32)~0)
@@ -36,7 +37,7 @@
   FIFO_EVENT_BUILTIN_RX
 } fifo_event_type_t;
 
-#define foreach_session_input_error                                         \
+#define foreach_session_input_error                                    	\
 _(NO_SESSION, "No session drops")                                       \
 _(NO_LISTENER, "No listener for dst port drops")                        \
 _(ENQUEUED, "Packets pushed into rx fifo")                              \
@@ -218,6 +219,15 @@
   /* Convenience */
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
+
+#if SESSION_DBG
+  /**
+   * last event poll time by thread
+   * Debug only. Will cause false cache-line sharing as-is
+   */
+  f64 *last_event_poll_by_thread;
+#endif
+
 };
 
 extern session_manager_main_t session_manager_main;
diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h
index 80a97cd..eb11f1a 100644
--- a/src/vnet/session/session_debug.h
+++ b/src/vnet/session/session_debug.h
@@ -16,13 +16,13 @@
 #define SRC_VNET_SESSION_SESSION_DEBUG_H_
 
 #include <vnet/session/transport.h>
-#include <vnet/session/session.h>
 #include <vlib/vlib.h>
 
 #define foreach_session_dbg_evt		\
   _(ENQ, "enqueue")			\
   _(DEQ, "dequeue")			\
-  _(DEQ_NODE, "dequeue")
+  _(DEQ_NODE, "dequeue")		\
+  _(POLL_GAP_TRACK, "poll gap track")	\
 
 typedef enum _session_evt_dbg
 {
@@ -33,6 +33,7 @@
 
 #define SESSION_DBG (0)
 #define SESSION_DEQ_NODE_EVTS (0)
+#define SESSION_EVT_POLL_DBG (1)
 
 #if TRANSPORT_DEBUG && SESSION_DBG
 
@@ -97,9 +98,34 @@
 #define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt)
 #endif
 
+#if SESSION_DBG && SESSION_EVT_POLL_DBG
+#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index)			\
+{	/* elog ms since this thread's last poll; needs 'now' in scope */	\
+  ELOG_TYPE_DECLARE (_e) =						\
+  {									\
+    .format = "nixon-gap: %d MS",					\
+    .format_args = "i4",						\
+  };									\
+  DEC_SESSION_ED(_e, 1);						\
+  ed->data[0] =	(u32) ((now -						\
+    (_smm)->last_event_poll_by_thread[_my_thread_index])*1000.0);	\
+}
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index)	\
+{	/* log gaps above 500us, then stamp 'now' (caller's local) */	\
+  if (PREDICT_TRUE(							\
+	(_smm)->last_event_poll_by_thread[_my_thread_index] != 0.0))	\
+    if (now > (_smm)->last_event_poll_by_thread[_my_thread_index] + 500e-6)\
+	SESSION_EVT_POLL_GAP(_smm, _my_thread_index);			\
+  (_smm)->last_event_poll_by_thread[_my_thread_index] = now;		\
+}
+
+#else
+#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index)
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index)
+#endif
+
 #define CONCAT_HELPER(_a, _b) _a##_b
 #define CC(_a, _b) CONCAT_HELPER(_a, _b)
-
 #define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
 
 #else
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 83cdbc1..e370506 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -43,6 +43,10 @@
 #include <vpp/api/vpe_all_api_h.h>
 #undef vl_printfun
 
+#define TCP_BUILTIN_CLIENT_DBG (1)
+#define TCP_BUILTIN_CLIENT_VPP_THREAD (0)
+#define TCP_BUILTIN_CLIENT_PTHREAD (!TCP_BUILTIN_CLIENT_VPP_THREAD)
+
 static void
 send_test_chunk (tclient_main_t * tm, session_t * s)
 {
@@ -52,35 +56,50 @@
   session_fifo_event_t evt;
   static int serial_number = 0;
   int rv;
+  test_buf_offset = s->bytes_sent % vec_len (test_data);
+  bytes_this_chunk = vec_len (test_data) - test_buf_offset;
 
-  while (s->bytes_to_send > 0)
+  bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
+    ? bytes_this_chunk : s->bytes_to_send;
+
+  rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ ,
+				bytes_this_chunk,
+				test_data + test_buf_offset);
+
+  /* If we managed to enqueue data... */
+  if (rv > 0)
     {
-
-      test_buf_offset = s->bytes_sent % vec_len (test_data);
-      bytes_this_chunk = vec_len (test_data) - test_buf_offset;
-
-      bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
-	? bytes_this_chunk : s->bytes_to_send;
-
-      rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ ,
-				    bytes_this_chunk,
-				    test_data + test_buf_offset);
-
-      if (rv > 0)
+      if (TCP_BUILTIN_CLIENT_DBG)
 	{
-	  s->bytes_to_send -= rv;
-	  s->bytes_sent += rv;
+          /* *INDENT-OFF* */
+          ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "tx-enq: %d bytes",
+              .format_args = "i4",
+            };
+          /* *INDENT-ON* */
+	  struct
+	  {
+	    u32 data[1];
+	  } *ed;
+	  ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+	  ed->data[0] = rv;
+	}
 
-	  if (svm_fifo_set_event (s->server_tx_fifo))
-	    {
-	      /* Fabricate TX event, send to vpp */
-	      evt.fifo = s->server_tx_fifo;
-	      evt.event_type = FIFO_EVENT_SERVER_TX;
-	      evt.event_id = serial_number++;
+      /* Account for it... */
+      s->bytes_to_send -= rv;
+      s->bytes_sent += rv;
 
-	      unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
-					    0 /* do wait for mutex */ );
-	    }
+      /* Poke the TCP state machine */
+      if (svm_fifo_set_event (s->server_tx_fifo))
+	{
+	  /* Fabricate TX event, send to vpp */
+	  evt.fifo = s->server_tx_fifo;
+	  evt.event_type = FIFO_EVENT_SERVER_TX;
+	  evt.event_id = serial_number++;
+
+	  unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
+					0 /* do wait for mutex */ );
 	}
     }
 }
@@ -89,39 +108,55 @@
 receive_test_chunk (tclient_main_t * tm, session_t * s)
 {
   svm_fifo_t *rx_fifo = s->server_rx_fifo;
-  int n_read, bytes, i;
+  int n_read, test_bytes = 0;
 
-  bytes = svm_fifo_max_dequeue (rx_fifo);
   /* Allow enqueuing of new event */
-  svm_fifo_unset_event (rx_fifo);
+  // svm_fifo_unset_event (rx_fifo);
 
-  /* Read the bytes */
-  do
+  n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf),
+				    tm->rx_buf);
+  if (n_read > 0)
     {
-      n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf),
-					tm->rx_buf);
-      if (n_read > 0)
+      if (TCP_BUILTIN_CLIENT_DBG)
 	{
-	  bytes -= n_read;
+          /* *INDENT-OFF* */
+          ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "rx-deq: %d bytes",
+              .format_args = "i4",
+            };
+          /* *INDENT-ON* */
+	  struct
+	  {
+	    u32 data[1];
+	  } *ed;
+	  ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+	  ed->data[0] = n_read;
+	}
+
+      if (test_bytes)
+	{
+	  int i;
 	  for (i = 0; i < n_read; i++)
 	    {
 	      if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff))
 		{
 		  clib_warning ("read %d error at byte %lld, 0x%x not 0x%x",
-				n_read, s->bytes_received + i,
-				tm->rx_buf[i],
+				n_read, s->bytes_received + i, tm->rx_buf[i],
 				((s->bytes_received + i) & 0xff));
 		}
 	    }
-	  s->bytes_to_receive -= n_read;
-	  s->bytes_received += n_read;
 	}
-
+      s->bytes_to_receive -= n_read;
+      s->bytes_received += n_read;
     }
-  while (n_read < 0 || bytes > 0);
 }
 
+#if TCP_BUILTIN_CLIENT_VPP_THREAD
+static void
+#else
 static void *
+#endif
 tclient_thread_fn (void *arg)
 {
   tclient_main_t *tm = &tclient_main;
@@ -139,6 +174,8 @@
     pthread_sigmask (SIG_SETMASK, &s, 0);
   }
 
+  clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0];
+
   while (1)
     {
       /* Wait until we're told to get busy */
@@ -186,12 +223,12 @@
 
       /* Disconnect sessions... */
       vec_reset_length (session_indices);
-      pool_foreach (sp, tm->sessions, (
-					{
-					vec_add1 (session_indices,
-						  sp - tm->sessions);
-					}
-		    ));
+
+      /* *INDENT-OFF* */
+      pool_foreach (sp, tm->sessions, ({
+	vec_add1 (session_indices, sp - tm->sessions);
+      }));
+      /* *INDENT-ON* */
 
       for (i = 0; i < vec_len (session_indices); i++)
 	{
@@ -207,7 +244,9 @@
 	}
     }
   /* NOTREACHED */
+#if TCP_BUILTIN_CLIENT_PTHREAD
   return 0;
+#endif
 }
 
 /* So we don't get "no handler for... " msgs */
@@ -333,7 +372,7 @@
 			     unformat_input_t * input,
 			     vlib_cli_command_t * cmd)
 {
-  u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234";
+  u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234";
   u8 *uri;
   tclient_main_t *tm = &tclient_main;
   int i;
@@ -349,7 +388,7 @@
 	;
       else if (unformat (input, "iterations %d", &tm->n_iterations))
 	;
-      else if (unformat (input, "bytes %d", &tm->bytes_to_send))
+      else if (unformat (input, "bytes %lld", &tm->bytes_to_send))
 	;
       else if (unformat (input, "uri %s", &tm->connect_uri))
 	;
@@ -366,17 +405,20 @@
 
   create_api_loopback (tm);
 
+#if TCP_BUILTIN_CLIENT_PTHREAD
   /* Start a transmit thread */
   if (tm->client_thread_handle == 0)
     {
       int rv = pthread_create (&tm->client_thread_handle,
-			       NULL /*attr */ , tclient_thread_fn, 0);
+			       NULL /*attr */ ,
+			       tclient_thread_fn, 0);
       if (rv)
 	{
 	  tm->client_thread_handle = 0;
 	  return clib_error_return (0, "pthread_create returned %d", rv);
 	}
     }
+#endif
 
   /* Fire off connect requests, in something approaching a normal manner */
   for (i = 0; i < n_clients; i++)
@@ -397,6 +439,18 @@
   return 0;
 }
 
+#if TCP_BUILTIN_CLIENT_VPP_THREAD
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (builtin_client_reg, static) = {
+  .name = "tcp-builtin-client",
+  .function = tclient_thread_fn,
+  .fixed_count = 1,
+  .count = 1,
+  .no_data_structure_clone = 1,
+};
+/* *INDENT-ON* */
+#endif
+
 /* *INDENT-OFF* */
 VLIB_CLI_COMMAND (test_clients_command, static) =
 {
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index efd26e9..917d4bd 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -127,6 +127,7 @@
     {
       /* XXX timeout for session that are stuck */
 
+    rx_event:
       /* Program self-tap to retry */
       if (svm_fifo_set_event (rx_fifo))
 	{
@@ -158,7 +159,9 @@
 
   n_written =
     svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf);
-  ASSERT (n_written == max_transfer);
+
+  if (n_written != max_transfer)
+    clib_warning ("short trout!");
 
   if (svm_fifo_set_event (tx_fifo))
     {
@@ -171,6 +174,9 @@
 				    (u8 *) & evt, 0 /* do wait for mutex */ );
     }
 
+  if (PREDICT_FALSE (max_enqueue < max_dequeue))
+    goto rx_event;
+
   return 0;
 }
 
@@ -204,8 +210,8 @@
   a->session_cb_vft = &builtin_session_cb_vft;
   a->options = options;
   a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20;
-  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10;
-  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10;
+  a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16;
+  a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16;
   a->segment_name = segment_name;
   a->segment_name_length = ARRAY_LEN (segment_name);
 
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index c3df5bc..b2a371e 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -578,7 +578,9 @@
       /* If we can't write at least a segment, don't try at all */
       if (snd_space < tc->snd_mss)
 	return 0;
-      return snd_space;
+
+      /* round down to mss multiple */
+      return snd_space - (snd_space % tc->snd_mss);
     }
 
   /* If in fast recovery, send 1 SMSS if wnd allows */
@@ -706,7 +708,7 @@
 {
     tcp_timer_retransmit_handler,
     tcp_timer_delack_handler,
-    0,
+    tcp_timer_persist_handler,
     tcp_timer_keep_handler,
     tcp_timer_waitclose_handler,
     tcp_timer_retransmit_syn_handler,
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index b4286bc..2f5da10 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -81,6 +81,7 @@
 
 extern timer_expiration_handler tcp_timer_delack_handler;
 extern timer_expiration_handler tcp_timer_retransmit_handler;
+extern timer_expiration_handler tcp_timer_persist_handler;
 extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
 
 #define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
@@ -253,13 +254,25 @@
 
 #define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY
 #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY
+#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY
+#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY
 #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
-#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
 #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh)
 #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
 #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
 
+#define tcp_in_cong_recovery(tc) ((tc)->flags & 		\
+	  (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+
+always_inline void
+tcp_cong_recovery_off (tcp_connection_t * tc)
+{
+  tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
+  tcp_fastrecovery_1_smss_off (tc);
+}
+
 typedef enum
 {
   TCP_IP4,
@@ -538,6 +551,27 @@
   tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
 }
 
+always_inline void
+tcp_persist_timer_set (tcp_connection_t * tc)
+{
+  /* Reuse RTO (min 1 after scaling); persist handler backs it off */
+  tcp_timer_set (tc, TCP_TIMER_PERSIST,
+		 clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_update (tcp_connection_t * tc)
+{
+  tcp_timer_update (tc, TCP_TIMER_PERSIST,
+		    clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_reset (tcp_connection_t * tc)
+{
+  tcp_timer_reset (tc, TCP_TIMER_PERSIST);
+}
+
 always_inline u8
 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
 {
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 5a71694..0090e15 100644
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -31,6 +31,7 @@
   _(UNBIND, "unbind")			\
   _(DELETE, "delete")			\
   _(SYN_SENT, "SYN sent")		\
+  _(SYN_RTX, "SYN retransmit")		\
   _(FIN_SENT, "FIN sent")		\
   _(ACK_SENT, "ACK sent")		\
   _(DUPACK_SENT, "DUPACK sent")		\
@@ -50,6 +51,7 @@
   _(CC_PACK, "cc partial ack")		\
   _(SEG_INVALID, "invalid segment")	\
   _(ACK_RCV_ERR, "invalid ack")		\
+  _(RCV_WND_SHRUNK, "shrunk rcv_wnd")	\
 
 typedef enum _tcp_dbg
 {
@@ -159,35 +161,48 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u",	\
-    .format_args = "i4i4i4i4",						\
+    .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\
+    .format_args = "i4i4i4i4i4",					\
   };									\
-  DECLARE_ETD(_tc, _e, 4);						\
+  DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->rcv_nxt - _tc->rcv_las;				\
   ed->data[1] = _tc->rcv_nxt - _tc->irs;				\
   ed->data[2] = _tc->rcv_wnd;						\
   ed->data[3] = _tc->snd_nxt - _tc->iss;				\
+  ed->data[4] = _tc->snd_wnd;						\
 }
 
 #define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...)				\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u",	\
-    .format_args = "i4i4i4i4",						\
+    .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+    .format_args = "i4i4i4i4i4",					\
   };									\
-  DECLARE_ETD(_tc, _e, 4);						\
+  DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->rcv_nxt - _tc->irs;				\
   ed->data[1] = _tc->rcv_wnd;						\
   ed->data[2] = _tc->snd_nxt - _tc->iss;				\
   ed->data[3] = tcp_available_wnd(_tc);					\
+  ed->data[4] = _tc->snd_wnd;						\
 }
 
 #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...)				\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "SYNtx: iss %u",					\
+    .format = "SYNtx: iss %u",						\
+    .format_args = "i4",						\
+  };									\
+  DECLARE_ETD(_tc, _e, 1);						\
+  ed->data[0] = _tc->iss;						\
+}
+
+#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...)				\
+{									\
+  ELOG_TYPE_DECLARE (_e) =						\
+  {									\
+    .format = "SYNrtx: iss %u",						\
     .format_args = "i4",						\
   };									\
   DECLARE_ETD(_tc, _e, 1);						\
@@ -254,17 +269,17 @@
   ed->data[1] = _tc->rcv_nxt - _tc->irs;				\
 }
 
-#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...)			\
+#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...)				\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u",	\
+    .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u",	\
     .format_args = "i4i4i4i4i4",					\
   };									\
   DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->bytes_acked;					\
   ed->data[1] = _tc->snd_una - _tc->iss;				\
-  ed->data[2] = _ack - _tc->iss;					\
+  ed->data[2] = _tc->snd_wnd;						\
   ed->data[3] = _tc->cwnd;						\
   ed->data[4] = tcp_flight_size(_tc);					\
 }
@@ -273,14 +288,15 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u",	\
-    .format_args = "i4i4i4i4",						\
+    .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\
+    .format_args = "i4i4i4i4i4",					\
   };									\
-  DECLARE_ETD(_tc, _e, 4);						\
+  DECLARE_ETD(_tc, _e, 5);						\
   ed->data[0] = _tc->snd_una - _tc->iss;				\
   ed->data[1] = _tc->cwnd;						\
   ed->data[2] = _tc->snd_wnd;						\
   ed->data[3] = tcp_flight_size(_tc);					\
+  ed->data[4] = _tc->rcv_wnd;						\
 }
 
 #define TCP_EVT_PKTIZE_HANDLER(_tc, ...)				\
@@ -302,7 +318,7 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "in: %s len %u written %d rcv_nxt %u free wnd %d",	\
+    .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d",	\
     .format_args = "t4i4i4i4i4",					\
     .n_enum_strings = 2,                                        	\
     .enum_strings = {                                           	\
@@ -338,7 +354,7 @@
     .enum_strings = {                                           	\
       "retransmit",                                             	\
       "delack",                                                 	\
-      "BUG",                                                    	\
+      "persist",                                                    	\
       "keep",                                                   	\
       "waitclose",                                              	\
       "retransmit syn",                                         	\
@@ -354,7 +370,7 @@
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u",	\
+    .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
     .format_args = "i4i4i4i4i4",					\
   };									\
   DECLARE_ETD(_tc, _e, 5);						\
@@ -445,6 +461,24 @@
 #define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
 #endif
 
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...)		\
+{									\
+if (_av > 0) 								\
+{									\
+  ELOG_TYPE_DECLARE (_e) =						\
+  {									\
+    .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u",	\
+    .format_args = "i4i4i4i4i4",					\
+  };									\
+  DECLARE_ETD(_tc, _e, 5);						\
+  ed->data[0] = _tc->rcv_wnd;						\
+  ed->data[1] = _obs;							\
+  ed->data[2] = _av;							\
+  ed->data[3] = _tc->rcv_nxt - _tc->irs;				\
+  ed->data[4] = _tc->rcv_las - _tc->irs;				\
+}									\
+}
+
 #if TCP_DBG_VERBOSE
 #define TCP_EVT_SND_WND_HANDLER(_tc, ...)				\
 {									\
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 5d11985f..a8224dc 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -276,8 +276,7 @@
       if (tc0->rcv_wnd == 0
 	  && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
 	{
-	  /* Make it look as if there's nothing to dequeue */
-	  vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number;
+	  /* TODO Should segment be tagged?  */
 	}
       else
 	{
@@ -375,7 +374,6 @@
   if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff)
     {
       mrtt = tcp_time_now () - tc->rtt_ts;
-      tc->rtt_seq = 0;
     }
 
   /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
@@ -395,6 +393,10 @@
 
   tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
 
+  /* Allow measuring of RTT and make sure boff is 0 */
+  tc->rtt_seq = 0;
+  tc->rto_boff = 0;
+
   return 1;
 }
 
@@ -408,11 +410,7 @@
   stream_session_dequeue_drop (&tc->connection, tc->bytes_acked);
 
   /* Update rtt and rto */
-  if (tcp_update_rtt (tc, ack))
-    {
-      /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */
-      tc->rto_boff = 0;
-    }
+  tcp_update_rtt (tc, ack);
 }
 
 /**
@@ -672,6 +670,13 @@
       tc->snd_wl1 = seq;
       tc->snd_wl2 = ack;
       TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
+
+      /* Wnd below 1 MSS: arm the probe (persist) timer unless it is already
+       * running. Wnd reopened: make sure the timer is stopped. */
+      if (tc->snd_wnd >= tc->snd_mss)
+	tcp_persist_timer_reset (tc);
+      else if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST))
+	tcp_persist_timer_set (tc);
     }
 }
 
@@ -686,6 +691,10 @@
 void
 tcp_cc_recover (tcp_connection_t * tc)
 {
+  /* TODO: check if time to recover was small. It might be that RTO popped
+   * too soon.
+   */
+
   tc->cc_algo->recovered (tc);
 
   tc->rtx_bytes = 0;
@@ -695,8 +704,7 @@
   tc->cc_algo->rcv_ack (tc);
   tc->tsecr_last_ack = tc->opt.tsecr;
 
-  tcp_fastrecovery_1_smss_off (tc);
-  tcp_fastrecovery_off (tc);
+  tcp_cong_recovery_off (tc);
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
 }
@@ -706,7 +714,7 @@
 {
   u8 partial_ack;
 
-  if (tcp_in_recovery (tc))
+  if (tcp_in_cong_recovery (tc))
     {
       partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
       if (!partial_ack)
@@ -724,10 +732,10 @@
 
 	  /* In case snd_nxt is still in the past and output tries to
 	   * shove some new bytes */
-	  tc->snd_nxt = tc->snd_una;
+	  tc->snd_nxt = tc->snd_una_max;
 
 	  /* XXX need proper RFC6675 support */
-	  if (tc->sack_sb.last_sacked_bytes)
+	  if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc))
 	    {
 	      tcp_fast_retransmit (tc);
 	    }
@@ -735,9 +743,6 @@
 	    {
 	      /* Retransmit first unacked segment */
 	      tcp_retransmit_first_unacked (tc);
-	      /* If window allows, send 1 SMSS of new data */
-	      if (seq_lt (tc->snd_nxt, tc->snd_congestion))
-		tc->snd_nxt = tc->snd_congestion;
 	    }
 	}
     }
@@ -814,10 +819,11 @@
 	  return -1;
 	}
 
-      tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
-      *error = TCP_ERROR_ACK_FUTURE;
       TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
 		   vnet_buffer (b)->tcp.ack_number);
+
+      tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+      *error = TCP_ERROR_ACK_FUTURE;
     }
 
   /* If old ACK, probably it's an old dupack */
@@ -863,7 +869,7 @@
    * timer. */
   if (tc->bytes_acked)
     {
-      TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number);
+      TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
 
       /* Updates congestion control (slow start/congestion avoidance) */
       tcp_cc_rcv_ack (tc, b);
@@ -966,11 +972,14 @@
       tc->rcv_nxt += written;
 
       /* Depending on how fast the app is, all remaining buffers in burst will
-       * not be enqueued. Should we inform peer of the damage? XXX */
+       * not be enqueued. Inform peer */
+      tc->flags |= TCP_CONN_SNDACK;
+
       return TCP_ERROR_PARTIALLY_ENQUEUED;
     }
   else
     {
+      tc->flags |= TCP_CONN_SNDACK;
       return TCP_ERROR_FIFO_FULL;
     }
 
@@ -1101,25 +1110,17 @@
       goto done;
     }
 
-  if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL))
-    *next0 = TCP_NEXT_DROP;
-
   /* Check if ACK can be delayed */
-  if (!tcp_can_delack (tc))
-    {
-      /* Nothing to do for pure ACKs XXX */
-      if (n_data_bytes == 0)
-	goto done;
-
-      *next0 = tcp_next_output (tc->c_is_ip4);
-      tcp_make_ack (tc, b);
-    }
-  else
+  if (tcp_can_delack (tc))
     {
       if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
 	tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+      goto done;
     }
 
+  *next0 = tcp_next_output (tc->c_is_ip4);
+  tcp_make_ack (tc, b);
+
 done:
   return error;
 }
@@ -2084,6 +2085,7 @@
 
 	  child0->irs = vnet_buffer (b0)->tcp.seq_number;
 	  child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
+	  child0->rcv_las = child0->rcv_nxt;
 	  child0->state = TCP_STATE_SYN_RCVD;
 
 	  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index a671f72..ea157bd 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -155,8 +155,7 @@
   max_fifo = stream_session_fifo_size (&tc->connection);
 
   ASSERT (tc->opt.mss < max_fifo);
-
-  if (available_space < tc->opt.mss && available_space < max_fifo / 8)
+  if (available_space < tc->opt.mss && available_space < max_fifo >> 3)
     available_space = 0;
 
   /*
@@ -170,16 +169,21 @@
   /* Bad. Thou shalt not shrink */
   if (available_space < observed_wnd)
     {
-      /* Does happen! */
       wnd = observed_wnd;
+      TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space);
     }
   else
     {
       wnd = available_space;
     }
 
-  if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd))
-    wnd += 1 << tc->rcv_wscale;
+  /* Make sure we have a multiple of 1 << rcv_wscale */
+  if (wnd && tc->rcv_wscale)
+    {
+      wnd &= ~((1 << tc->rcv_wscale) - 1);
+      if (wnd == 0)
+	wnd = 1 << tc->rcv_wscale;
+    }
 
   tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
 }
@@ -462,8 +466,9 @@
 
   tcp_reuse_buffer (vm, b);
   tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK);
-  vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
   TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc);
+  vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
+  tc->rcv_las = tc->rcv_nxt;
 }
 
 /**
@@ -908,6 +913,7 @@
   vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
 
   tc->snd_nxt += data_len;
+
   /* TODO this is updated in output as well ... */
   if (tc->snd_nxt > tc->snd_una_max)
     tc->snd_una_max = tc->snd_nxt;
@@ -929,7 +935,6 @@
 
   /* Fill in the ACK */
   tcp_make_ack (tc, b);
-
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
 }
 
@@ -942,7 +947,6 @@
 
   tc = tcp_connection_get (index, thread_index);
   tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID;
-//  tc->flags &= ~TCP_CONN_DELACK;
   tcp_send_ack (tc);
 }
 
@@ -995,7 +999,7 @@
  * Reset congestion control, switch cwnd to loss window and try again.
  */
 static void
-tcp_rtx_timeout_cc_recover (tcp_connection_t * tc)
+tcp_rtx_timeout_cc (tcp_connection_t * tc)
 {
   /* Cleanly recover cc (also clears up fast retransmit) */
   if (tcp_in_fastrecovery (tc))
@@ -1008,6 +1012,7 @@
     }
 
   /* Start again from the beginning */
+  tcp_recovery_on (tc);
   tc->cwnd = tcp_loss_wnd (tc);
   tc->snd_congestion = tc->snd_una_max;
 }
@@ -1048,7 +1053,7 @@
     {
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
-	tcp_rtx_timeout_cc_recover (tc);
+	tcp_rtx_timeout_cc (tc);
 
       /* Exponential backoff */
       tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
@@ -1114,6 +1119,8 @@
     {
       ASSERT (tc->state == TCP_STATE_SYN_SENT);
 
+      TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc);
+
       /* This goes straight to ipx_lookup */
       tcp_push_ip_hdr (tm, tc, b);
       tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
@@ -1137,6 +1144,55 @@
 }
 
 /**
+ * Persist timer handler: the peer's window is still at most one MSS, so push
+ * out up to one MSS of not-yet-sent data as a probe and re-arm the timer.
+ */
+void
+tcp_timer_persist_handler (u32 index)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  vlib_main_t *vm = vlib_get_main ();
+  u32 thread_index = os_get_cpu_number ();
+  tcp_connection_t *tc;
+  vlib_buffer_t *b;
+  u32 bi, n_bytes;
+
+  tc = tcp_connection_get (index, thread_index);
+
+  /* The timer has fired, so its handle is stale: mark it invalid */
+  tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
+
+  /* Window reopened meanwhile, or we are in recovery: nothing to probe */
+  if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
+    return;
+
+  /* Back off: count the attempt and double the RTO, capped at TCP_RTO_MAX */
+  tc->rto_boff += 1;
+  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+
+  /* Peek at most snd_mss bytes at offset snd_una_max - snd_una (unsent) */
+  tcp_get_free_buffer_index (tm, &bi);
+  b = vlib_get_buffer (vm, bi);
+  n_bytes = stream_session_peek_bytes (&tc->connection,
+				       vlib_buffer_get_current (b),
+				       tc->snd_una_max - tc->snd_una,
+				       tc->snd_mss);
+  /* Nothing to send: return the buffer and do not re-arm the timer */
+  if (n_bytes == 0)
+    {
+      tcp_return_buffer (tm);
+      return;
+    }
+
+  b->current_length = n_bytes;
+  tcp_push_hdr_i (tc, b, tc->state);
+  tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+
+  /* Re-arm the persist timer; it uses the RTO doubled above */
+  tcp_persist_timer_set (tc);
+}
+
+/**
  * Retransmit first unacked segment
  */
 void
@@ -1329,9 +1385,6 @@
 		}
 	    }
 
-	  /* Retransmitted SYNs do reach this but it should be harmless */
-	  tc0->rcv_las = tc0->rcv_nxt;
-
 	  /* Stop DELACK timer and fix flags */
 	  tc0->flags &= ~(TCP_CONN_SNDACK);
 	  if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK))