tcp: improve rate samples for retransmitted segments

Type: fix

- Initialize max_seq on both transmitted and retransmitted segments
- Keep track of segments that have been sacked
- Track new data segments sent during recovery

Change-Id: Ice55231a3da200ae6171702e54b2ce155f831143
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 75046ae..955b2dd 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -140,7 +140,6 @@
   _(DEQ_PENDING, "Dequeue pending ")		\
   _(PSH_PENDING, "PSH pending")			\
   _(FINRCVD, "FIN received")			\
-  _(TRACK_BURST, "Track burst")			\
   _(ZERO_RWND_SENT, "Zero RWND sent")		\
 
 typedef enum tcp_connection_flag_bits_
@@ -256,6 +255,7 @@
 {
   TCP_BTS_IS_RXT = 1,
   TCP_BTS_IS_APP_LIMITED = 1 << 1,
+  TCP_BTS_IS_SACKED = 1 << 2,
 } __clib_packed tcp_bts_flags_t;
 
 typedef struct tcp_bt_sample_
@@ -821,7 +821,7 @@
  *
  * @param tc	tcp connection
  */
-void tcp_bt_track_tx (tcp_connection_t * tc);
+void tcp_bt_track_tx (tcp_connection_t * tc, u32 len);
 /**
  * Track a tcp retransmission
  *
@@ -852,6 +852,7 @@
  * @param bt	byte tracker
  */
 int tcp_bt_is_sane (tcp_byte_tracker_t * bt);
+u8 *format_tcp_bt (u8 * s, va_list * args);
 
 always_inline u32
 tcp_end_seq (tcp_header_t * th, u32 len)
diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c
index b550fdd..b3f4e6a 100644
--- a/src/vnet/tcp/tcp_bt.c
+++ b/src/vnet/tcp/tcp_bt.c
@@ -53,13 +53,14 @@
 }
 
 static tcp_bt_sample_t *
-bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq)
+bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq, u32 max_seq)
 {
   tcp_bt_sample_t *bts;
 
   pool_get_zero (bt->samples, bts);
   bts->next = bts->prev = TCP_BTS_INVALID_INDEX;
   bts->min_seq = min_seq;
+  bts->max_seq = max_seq;
   rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples,
 		      bt_seq_lt);
   return bts;
@@ -91,6 +92,47 @@
 }
 
 static tcp_bt_sample_t *
+bt_split_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
+{
+  tcp_bt_sample_t *ns, *next;
+  u32 bts_index;
+
+  bts_index = bt_sample_index (bt, bts);
+
+  ASSERT (seq_leq (bts->min_seq, seq) && seq_lt (seq, bts->max_seq));
+
+  ns = bt_alloc_sample (bt, seq, bts->max_seq);
+  bts = bt_get_sample (bt, bts_index);
+
+  *ns = *bts;
+  ns->min_seq = seq;
+  bts->max_seq = seq;
+
+  next = bt_next_sample (bt, bts);
+  if (next)
+    next->prev = bt_sample_index (bt, ns);
+  else
+    bt->tail = bt_sample_index (bt, ns);
+
+  bts->next = bt_sample_index (bt, ns);
+  ns->prev = bt_sample_index (bt, bts);
+
+  return ns;
+}
+
+static tcp_bt_sample_t *
+bt_merge_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * prev,
+		 tcp_bt_sample_t * cur)
+{
+  ASSERT (prev->max_seq == cur->min_seq);
+  prev->max_seq = cur->max_seq;
+  if (bt_sample_index (bt, cur) == bt->tail)
+    bt->tail = bt_sample_index (bt, prev);
+  bt_free_sample (bt, cur);
+  return prev;
+}
+
+static tcp_bt_sample_t *
 bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq)
 {
   rb_tree_t *rt = &bt->sample_lookup;
@@ -154,27 +196,16 @@
   tcp_bt_sample_t *cur, *next;
 
   cur = start;
-  while ((next = bt_next_sample (bt, cur)) && seq_lt (next->min_seq, seq))
+  while (cur && seq_leq (cur->max_seq, seq))
     {
+      next = bt_next_sample (bt, cur);
       bt_free_sample (bt, cur);
       cur = next;
     }
 
-  if (next)
-    {
-      bt_free_sample (bt, cur);
-      return next;
-    }
+  if (cur && seq_lt (cur->min_seq, seq))
+    bt_update_sample (bt, cur, seq);
 
-  /* Overlapping current entirely */
-  if (is_end)
-    {
-      bt_free_sample (bt, cur);
-      return 0;
-    }
-
-  /* Overlapping head of current but not all */
-  bt_update_sample (bt, cur, seq);
   return cur;
 }
 
@@ -235,10 +266,10 @@
 }
 
 static tcp_bt_sample_t *
-tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq)
+tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq, u32 max_seq)
 {
   tcp_bt_sample_t *bts;
-  bts = bt_alloc_sample (tc->bt, min_seq);
+  bts = bt_alloc_sample (tc->bt, min_seq, max_seq);
   bts->delivered = tc->delivered;
   bts->delivered_time = tc->delivered_time;
   bts->tx_time = tcp_time_now_us (tc->c_thread_index);
@@ -263,19 +294,27 @@
 }
 
 void
-tcp_bt_track_tx (tcp_connection_t * tc)
+tcp_bt_track_tx (tcp_connection_t * tc, u32 len)
 {
   tcp_byte_tracker_t *bt = tc->bt;
   tcp_bt_sample_t *bts, *tail;
   u32 bts_index;
 
+  tail = bt_get_sample (bt, bt->tail);
+  if (tail && tail->max_seq == tc->snd_nxt
+      && tail->tx_time == tcp_time_now_us (tc->c_thread_index))
+    {
+      tail->max_seq += len;
+      return;
+    }
+
   if (tc->snd_una == tc->snd_nxt)
     {
       tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
       tc->first_tx_time = tc->delivered_time;
     }
 
-  bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt);
+  bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt, tc->snd_nxt + len);
   bts_index = bt_sample_index (bt, bts);
   tail = bt_get_sample (bt, bt->tail);
   if (tail)
@@ -295,11 +334,13 @@
 {
   tcp_byte_tracker_t *bt = tc->bt;
   tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts;
-  u32 bts_index, cur_index, next_index, prev_index, min_seq;
+  u32 bts_index, cur_index, next_index, prev_index, max_seq;
   u8 is_end = end == tc->snd_nxt;
 
+  /* Contiguous blocks retransmitted at the same time */
   bts = bt_get_sample (bt, bt->last_ooo);
-  if (bts && bts->max_seq == start)
+  if (bts && bts->max_seq == start
+      && bts->tx_time == tcp_time_now_us (tc->c_thread_index))
     {
       bts->max_seq = end;
       next = bt_next_sample (bt, bts);
@@ -325,8 +366,7 @@
       next = bt_fix_overlapped (bt, bts, end, is_end);
       next_index = bt_sample_index (bt, next);
 
-      cur = tcp_bt_alloc_tx_sample (tc, start);
-      cur->max_seq = end;
+      cur = tcp_bt_alloc_tx_sample (tc, start, end);
       cur->flags |= TCP_BTS_IS_RXT;
       cur->next = next_index;
       cur->prev = prev_index;
@@ -362,20 +402,19 @@
   if (next)
     next = bt_fix_overlapped (bt, next, end, is_end);
 
-  min_seq = next ? next->min_seq : tc->snd_nxt;
-  ASSERT (seq_lt (start, min_seq));
+  max_seq = bts->max_seq;
+  ASSERT (seq_lt (start, max_seq));
 
   /* Have to split or tail overlap */
-  cur = tcp_bt_alloc_tx_sample (tc, start);
-  cur->max_seq = end;
+  cur = tcp_bt_alloc_tx_sample (tc, start, end);
   cur->flags |= TCP_BTS_IS_RXT;
   cur->prev = bts_index;
   cur_index = bt_sample_index (bt, cur);
 
   /* Split. Allocate another sample */
-  if (seq_lt (end, min_seq))
+  if (seq_lt (end, max_seq))
     {
-      nbts = tcp_bt_alloc_tx_sample (tc, end);
+      nbts = tcp_bt_alloc_tx_sample (tc, end, bts->max_seq);
       cur = bt_get_sample (bt, cur_index);
       bts = bt_get_sample (bt, bts_index);
 
@@ -393,12 +432,14 @@
       bts->next = nbts->prev = cur_index;
       cur->next = bt_sample_index (bt, nbts);
 
+      bts->max_seq = start;
       bt->last_ooo = cur_index;
     }
   /* Tail completely overlapped */
   else
     {
       bts = bt_get_sample (bt, bts_index);
+      bts->max_seq = start;
 
       if (bts->next != TCP_BTS_INVALID_INDEX)
 	{
@@ -419,13 +460,16 @@
 tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts,
 			      tcp_rate_sample_t * rs)
 {
+  if (bts->flags & TCP_BTS_IS_SACKED)
+    return;
+
   if (rs->prior_delivered && rs->prior_delivered >= bts->delivered)
     return;
 
   rs->prior_delivered = bts->delivered;
   rs->prior_time = bts->delivered_time;
   rs->interval_time = bts->tx_time - bts->first_tx_time;
-  rs->rtt_time = bts->tx_time;
+  rs->rtt_time = tc->delivered_time - bts->tx_time;
   rs->flags = bts->flags;
   tc->first_tx_time = bts->tx_time;
 }
@@ -437,31 +481,16 @@
   tcp_bt_sample_t *next, *cur;
 
   cur = bt_get_sample (bt, bt->head);
-  tcp_bt_sample_to_rate_sample (tc, cur, rs);
-  while ((next = bt_get_sample (bt, cur->next))
-	 && seq_lt (next->min_seq, tc->snd_una))
+  while (cur && seq_leq (cur->max_seq, tc->snd_una))
     {
-      bt_free_sample (bt, cur);
-      tcp_bt_sample_to_rate_sample (tc, next, rs);
-      cur = next;
-    }
-
-  ASSERT (seq_lt (cur->min_seq, tc->snd_una));
-
-  /* All samples acked */
-  if (tc->snd_una == tc->snd_nxt)
-    {
-      ASSERT (pool_elts (bt->samples) == 1);
-      bt_free_sample (bt, cur);
-      return;
-    }
-
-  /* Current sample completely consumed */
-  if (next && next->min_seq == tc->snd_una)
-    {
+      next = bt_next_sample (bt, cur);
+      tcp_bt_sample_to_rate_sample (tc, cur, rs);
       bt_free_sample (bt, cur);
       cur = next;
     }
+
+  if (cur && seq_lt (cur->min_seq, tc->snd_una))
+    tcp_bt_sample_to_rate_sample (tc, cur, rs);
 }
 
 static void
@@ -469,7 +498,7 @@
 {
   sack_block_t *blks = tc->rcv_opts.sacks, *blk;
   tcp_byte_tracker_t *bt = tc->bt;
-  tcp_bt_sample_t *next, *cur;
+  tcp_bt_sample_t *cur, *prev, *next;
   int i;
 
   for (i = 0; i < vec_len (blks); i++)
@@ -484,27 +513,64 @@
       if (!cur)
 	continue;
 
-      tcp_bt_sample_to_rate_sample (tc, cur, rs);
+      ASSERT (seq_geq (blk->start, cur->min_seq)
+	      && seq_lt (blk->start, cur->max_seq));
 
-      /* Current shouldn't be removed */
-      if (cur->min_seq != blk->start)
+      /* Current should be split. Second part will be consumed */
+      if (PREDICT_FALSE (cur->min_seq != blk->start))
 	{
-	  cur = bt_next_sample (bt, cur);
-	  if (!cur)
-	    continue;
+	  cur = bt_split_sample (bt, cur, blk->start);
+	  prev = bt_prev_sample (bt, cur);
 	}
+      else
+	prev = bt_prev_sample (bt, cur);
 
-      while ((next = bt_get_sample (bt, cur->next))
-	     && seq_lt (next->min_seq, blk->end))
+      while (cur && seq_leq (cur->max_seq, blk->end))
 	{
-	  bt_free_sample (bt, cur);
-	  tcp_bt_sample_to_rate_sample (tc, next, rs);
+	  if (!(cur->flags & TCP_BTS_IS_SACKED))
+	    {
+	      tcp_bt_sample_to_rate_sample (tc, cur, rs);
+	      cur->flags |= TCP_BTS_IS_SACKED;
+	      if (prev && (prev->flags & TCP_BTS_IS_SACKED))
+		{
+		  cur = bt_merge_sample (bt, prev, cur);
+		  next = bt_next_sample (bt, cur);
+		}
+	      else
+		{
+		  next = bt_next_sample (bt, cur);
+		  if (next && (next->flags & TCP_BTS_IS_SACKED))
+		    {
+		      cur = bt_merge_sample (bt, cur, next);
+		      next = bt_next_sample (bt, cur);
+		    }
+		}
+	    }
+	  else
+	    next = bt_next_sample (bt, cur);
+
+	  prev = cur;
 	  cur = next;
 	}
 
-      /* Current consumed entirely */
-      if (next && next->min_seq == blk->end)
-	bt_free_sample (bt, cur);
+      if (cur && seq_lt (cur->min_seq, blk->end))
+	{
+	  tcp_bt_sample_to_rate_sample (tc, cur, rs);
+	  prev = bt_prev_sample (bt, cur);
+	  /* Extend previous to include the newly sacked bytes */
+	  if (prev && (prev->flags & TCP_BTS_IS_SACKED))
+	    {
+	      prev->max_seq = blk->end;
+	      bt_update_sample (bt, cur, blk->end);
+	    }
+	  /* Split sample into two. First part is consumed */
+	  else
+	    {
+	      next = bt_split_sample (bt, cur, blk->end);
+	      cur = bt_prev_sample (bt, next);
+	      cur->flags |= TCP_BTS_IS_SACKED;
+	    }
+	}
     }
 }
 
@@ -533,10 +599,9 @@
   if (tc->sack_sb.last_sacked_bytes)
     tcp_bt_walk_samples_ooo (tc, rs);
 
-  rs->interval_time = clib_max (tc->delivered_time - rs->prior_time,
+  rs->interval_time = clib_max ((tc->delivered_time - rs->prior_time),
 				rs->interval_time);
   rs->delivered = tc->delivered - rs->prior_delivered;
-  rs->rtt_time = tc->delivered_time - rs->rtt_time;
   rs->acked_and_sacked = delivered;
   rs->lost = tc->sack_sb.last_lost_bytes;
 }
@@ -590,6 +655,36 @@
   tc->bt = bt;
 }
 
+u8 *
+format_tcp_bt_sample (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  tcp_bt_sample_t *bts = va_arg (*args, tcp_bt_sample_t *);
+  f64 now = tcp_time_now_us (tc->c_thread_index);
+  s = format (s, "[%u, %u] d %u dt %.3f txt %.3f ftxt %.3f flags 0x%x",
+	      bts->min_seq - tc->iss, bts->max_seq - tc->iss, bts->delivered,
+	      now - bts->delivered_time, now - bts->tx_time,
+	      now - bts->first_tx_time, bts->flags);
+  return s;
+}
+
+u8 *
+format_tcp_bt (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  tcp_byte_tracker_t *bt = tc->bt;
+  tcp_bt_sample_t *bts;
+
+  bts = bt_get_sample (bt, bt->head);
+  while (bts)
+    {
+      s = format (s, "%U\n", format_tcp_bt_sample, tc, bts);
+      bts = bt_next_sample (bt, bts);
+    }
+
+  return s;
+}
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 0047f3c..4528216 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -406,7 +406,7 @@
   tcp_update_rcv_wnd (tc);
 
   if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
-    tc->flags |= TCP_CONN_TRACK_BURST;
+    tcp_bt_check_app_limited (tc);
 
   if (tc->snd_una == tc->snd_nxt)
     {
@@ -1124,17 +1124,22 @@
   TCP_EVT (TCP_EVT_PKTIZE, tc);
 }
 
+always_inline u32
+tcp_buffer_len (vlib_buffer_t * b)
+{
+  u32 data_len = b->current_length;
+  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+    data_len += b->total_length_not_including_first_buffer;
+  return data_len;
+}
+
 u32
 tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
 {
   tcp_connection_t *tc = (tcp_connection_t *) tconn;
 
-  if (tc->flags & TCP_CONN_TRACK_BURST)
-    {
-      tcp_bt_check_app_limited (tc);
-      tcp_bt_track_tx (tc);
-      tc->flags &= ~TCP_CONN_TRACK_BURST;
-    }
+  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+    tcp_bt_track_tx (tc, tcp_buffer_len (b));
 
   tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
 		  /* update_snd_nxt */ 1);
@@ -1736,7 +1741,7 @@
   if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
     {
       tcp_bt_check_app_limited (tc);
-      tcp_bt_track_tx (tc);
+      tcp_bt_track_tx (tc, n_bytes);
     }
 
   tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0,
@@ -1783,6 +1788,9 @@
   available_wnd = tc->snd_wnd - offset;
   burst_size = clib_min (burst_size, available_wnd / tc->snd_mss);
 
+  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+    tcp_bt_check_app_limited (tc);
+
   while (n_segs < burst_size)
     {
       n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b);
@@ -1794,6 +1802,9 @@
       offset += n_written;
       n_segs += 1;
 
+      if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+	tcp_bt_track_tx (tc, n_written);
+
       tc->snd_nxt += n_written;
       tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
     }