tcp: improve rate samples for retransmitted segments
Type: fix
- Initialize max_seq on both transmitted and retransmitted segments
- Keep track of segments that have been sacked.
- Track new data segments sent during recovery
Change-Id: Ice55231a3da200ae6171702e54b2ce155f831143
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 75046ae..955b2dd 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -140,7 +140,6 @@
_(DEQ_PENDING, "Dequeue pending ") \
_(PSH_PENDING, "PSH pending") \
_(FINRCVD, "FIN received") \
- _(TRACK_BURST, "Track burst") \
_(ZERO_RWND_SENT, "Zero RWND sent") \
typedef enum tcp_connection_flag_bits_
@@ -256,6 +255,7 @@
{
TCP_BTS_IS_RXT = 1,
TCP_BTS_IS_APP_LIMITED = 1 << 1,
+ TCP_BTS_IS_SACKED = 1 << 2,
} __clib_packed tcp_bts_flags_t;
typedef struct tcp_bt_sample_
@@ -821,7 +821,7 @@
*
* @param tc tcp connection
*/
-void tcp_bt_track_tx (tcp_connection_t * tc);
+void tcp_bt_track_tx (tcp_connection_t * tc, u32 len);
/**
* Track a tcp retransmission
*
@@ -852,6 +852,7 @@
* @param bt byte tracker
*/
int tcp_bt_is_sane (tcp_byte_tracker_t * bt);
+u8 *format_tcp_bt (u8 * s, va_list * args);
always_inline u32
tcp_end_seq (tcp_header_t * th, u32 len)
diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c
index b550fdd..b3f4e6a 100644
--- a/src/vnet/tcp/tcp_bt.c
+++ b/src/vnet/tcp/tcp_bt.c
@@ -53,13 +53,14 @@
}
static tcp_bt_sample_t *
-bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq)
+bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq, u32 max_seq)
{
tcp_bt_sample_t *bts;
pool_get_zero (bt->samples, bts);
bts->next = bts->prev = TCP_BTS_INVALID_INDEX;
bts->min_seq = min_seq;
+ bts->max_seq = max_seq;
rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples,
bt_seq_lt);
return bts;
@@ -91,6 +92,47 @@
}
static tcp_bt_sample_t *
+bt_split_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
+{
+ tcp_bt_sample_t *ns, *next;
+ u32 bts_index;
+
+ bts_index = bt_sample_index (bt, bts);
+
+ ASSERT (seq_leq (bts->min_seq, seq) && seq_lt (seq, bts->max_seq));
+
+ ns = bt_alloc_sample (bt, seq, bts->max_seq);
+ bts = bt_get_sample (bt, bts_index);
+
+ *ns = *bts;
+ ns->min_seq = seq;
+ bts->max_seq = seq;
+
+ next = bt_next_sample (bt, bts);
+ if (next)
+ next->prev = bt_sample_index (bt, ns);
+ else
+ bt->tail = bt_sample_index (bt, ns);
+
+ bts->next = bt_sample_index (bt, ns);
+ ns->prev = bt_sample_index (bt, bts);
+
+ return ns;
+}
+
+static tcp_bt_sample_t *
+bt_merge_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * prev,
+ tcp_bt_sample_t * cur)
+{
+ ASSERT (prev->max_seq == cur->min_seq);
+ prev->max_seq = cur->max_seq;
+ if (bt_sample_index (bt, cur) == bt->tail)
+ bt->tail = bt_sample_index (bt, prev);
+ bt_free_sample (bt, cur);
+ return prev;
+}
+
+static tcp_bt_sample_t *
bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq)
{
rb_tree_t *rt = &bt->sample_lookup;
@@ -154,27 +196,16 @@
tcp_bt_sample_t *cur, *next;
cur = start;
- while ((next = bt_next_sample (bt, cur)) && seq_lt (next->min_seq, seq))
+ while (cur && seq_leq (cur->max_seq, seq))
{
+ next = bt_next_sample (bt, cur);
bt_free_sample (bt, cur);
cur = next;
}
- if (next)
- {
- bt_free_sample (bt, cur);
- return next;
- }
+ if (cur && seq_lt (cur->min_seq, seq))
+ bt_update_sample (bt, cur, seq);
- /* Overlapping current entirely */
- if (is_end)
- {
- bt_free_sample (bt, cur);
- return 0;
- }
-
- /* Overlapping head of current but not all */
- bt_update_sample (bt, cur, seq);
return cur;
}
@@ -235,10 +266,10 @@
}
static tcp_bt_sample_t *
-tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq)
+tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq, u32 max_seq)
{
tcp_bt_sample_t *bts;
- bts = bt_alloc_sample (tc->bt, min_seq);
+ bts = bt_alloc_sample (tc->bt, min_seq, max_seq);
bts->delivered = tc->delivered;
bts->delivered_time = tc->delivered_time;
bts->tx_time = tcp_time_now_us (tc->c_thread_index);
@@ -263,19 +294,27 @@
}
void
-tcp_bt_track_tx (tcp_connection_t * tc)
+tcp_bt_track_tx (tcp_connection_t * tc, u32 len)
{
tcp_byte_tracker_t *bt = tc->bt;
tcp_bt_sample_t *bts, *tail;
u32 bts_index;
+ tail = bt_get_sample (bt, bt->tail);
+ if (tail && tail->max_seq == tc->snd_nxt
+ && tail->tx_time == tcp_time_now_us (tc->c_thread_index))
+ {
+ tail->max_seq += len;
+ return;
+ }
+
if (tc->snd_una == tc->snd_nxt)
{
tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
tc->first_tx_time = tc->delivered_time;
}
- bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt);
+ bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt, tc->snd_nxt + len);
bts_index = bt_sample_index (bt, bts);
tail = bt_get_sample (bt, bt->tail);
if (tail)
@@ -295,11 +334,13 @@
{
tcp_byte_tracker_t *bt = tc->bt;
tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts;
- u32 bts_index, cur_index, next_index, prev_index, min_seq;
+ u32 bts_index, cur_index, next_index, prev_index, max_seq;
u8 is_end = end == tc->snd_nxt;
+ /* Contiguous blocks retransmitted at the same time */
bts = bt_get_sample (bt, bt->last_ooo);
- if (bts && bts->max_seq == start)
+ if (bts && bts->max_seq == start
+ && bts->tx_time == tcp_time_now_us (tc->c_thread_index))
{
bts->max_seq = end;
next = bt_next_sample (bt, bts);
@@ -325,8 +366,7 @@
next = bt_fix_overlapped (bt, bts, end, is_end);
next_index = bt_sample_index (bt, next);
- cur = tcp_bt_alloc_tx_sample (tc, start);
- cur->max_seq = end;
+ cur = tcp_bt_alloc_tx_sample (tc, start, end);
cur->flags |= TCP_BTS_IS_RXT;
cur->next = next_index;
cur->prev = prev_index;
@@ -362,20 +402,19 @@
if (next)
next = bt_fix_overlapped (bt, next, end, is_end);
- min_seq = next ? next->min_seq : tc->snd_nxt;
- ASSERT (seq_lt (start, min_seq));
+ max_seq = bts->max_seq;
+ ASSERT (seq_lt (start, max_seq));
/* Have to split or tail overlap */
- cur = tcp_bt_alloc_tx_sample (tc, start);
- cur->max_seq = end;
+ cur = tcp_bt_alloc_tx_sample (tc, start, end);
cur->flags |= TCP_BTS_IS_RXT;
cur->prev = bts_index;
cur_index = bt_sample_index (bt, cur);
/* Split. Allocate another sample */
- if (seq_lt (end, min_seq))
+ if (seq_lt (end, max_seq))
{
- nbts = tcp_bt_alloc_tx_sample (tc, end);
+ nbts = tcp_bt_alloc_tx_sample (tc, end, bts->max_seq);
cur = bt_get_sample (bt, cur_index);
bts = bt_get_sample (bt, bts_index);
@@ -393,12 +432,14 @@
bts->next = nbts->prev = cur_index;
cur->next = bt_sample_index (bt, nbts);
+ bts->max_seq = start;
bt->last_ooo = cur_index;
}
/* Tail completely overlapped */
else
{
bts = bt_get_sample (bt, bts_index);
+ bts->max_seq = start;
if (bts->next != TCP_BTS_INVALID_INDEX)
{
@@ -419,13 +460,16 @@
tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts,
tcp_rate_sample_t * rs)
{
+ if (bts->flags & TCP_BTS_IS_SACKED)
+ return;
+
if (rs->prior_delivered && rs->prior_delivered >= bts->delivered)
return;
rs->prior_delivered = bts->delivered;
rs->prior_time = bts->delivered_time;
rs->interval_time = bts->tx_time - bts->first_tx_time;
- rs->rtt_time = bts->tx_time;
+ rs->rtt_time = tc->delivered_time - bts->tx_time;
rs->flags = bts->flags;
tc->first_tx_time = bts->tx_time;
}
@@ -437,31 +481,16 @@
tcp_bt_sample_t *next, *cur;
cur = bt_get_sample (bt, bt->head);
- tcp_bt_sample_to_rate_sample (tc, cur, rs);
- while ((next = bt_get_sample (bt, cur->next))
- && seq_lt (next->min_seq, tc->snd_una))
+ while (cur && seq_leq (cur->max_seq, tc->snd_una))
{
- bt_free_sample (bt, cur);
- tcp_bt_sample_to_rate_sample (tc, next, rs);
- cur = next;
- }
-
- ASSERT (seq_lt (cur->min_seq, tc->snd_una));
-
- /* All samples acked */
- if (tc->snd_una == tc->snd_nxt)
- {
- ASSERT (pool_elts (bt->samples) == 1);
- bt_free_sample (bt, cur);
- return;
- }
-
- /* Current sample completely consumed */
- if (next && next->min_seq == tc->snd_una)
- {
+ next = bt_next_sample (bt, cur);
+ tcp_bt_sample_to_rate_sample (tc, cur, rs);
bt_free_sample (bt, cur);
cur = next;
}
+
+ if (cur && seq_lt (cur->min_seq, tc->snd_una))
+ tcp_bt_sample_to_rate_sample (tc, cur, rs);
}
static void
@@ -469,7 +498,7 @@
{
sack_block_t *blks = tc->rcv_opts.sacks, *blk;
tcp_byte_tracker_t *bt = tc->bt;
- tcp_bt_sample_t *next, *cur;
+ tcp_bt_sample_t *cur, *prev, *next;
int i;
for (i = 0; i < vec_len (blks); i++)
@@ -484,27 +513,64 @@
if (!cur)
continue;
- tcp_bt_sample_to_rate_sample (tc, cur, rs);
+ ASSERT (seq_geq (blk->start, cur->min_seq)
+ && seq_lt (blk->start, cur->max_seq));
- /* Current shouldn't be removed */
- if (cur->min_seq != blk->start)
+ /* Current should be split. Second part will be consumed */
+ if (PREDICT_FALSE (cur->min_seq != blk->start))
{
- cur = bt_next_sample (bt, cur);
- if (!cur)
- continue;
+ cur = bt_split_sample (bt, cur, blk->start);
+ prev = bt_prev_sample (bt, cur);
}
+ else
+ prev = bt_prev_sample (bt, cur);
- while ((next = bt_get_sample (bt, cur->next))
- && seq_lt (next->min_seq, blk->end))
+ while (cur && seq_leq (cur->max_seq, blk->end))
{
- bt_free_sample (bt, cur);
- tcp_bt_sample_to_rate_sample (tc, next, rs);
+ if (!(cur->flags & TCP_BTS_IS_SACKED))
+ {
+ tcp_bt_sample_to_rate_sample (tc, cur, rs);
+ cur->flags |= TCP_BTS_IS_SACKED;
+ if (prev && (prev->flags & TCP_BTS_IS_SACKED))
+ {
+ cur = bt_merge_sample (bt, prev, cur);
+ next = bt_next_sample (bt, cur);
+ }
+ else
+ {
+ next = bt_next_sample (bt, cur);
+ if (next && (next->flags & TCP_BTS_IS_SACKED))
+ {
+ cur = bt_merge_sample (bt, cur, next);
+ next = bt_next_sample (bt, cur);
+ }
+ }
+ }
+ else
+ next = bt_next_sample (bt, cur);
+
+ prev = cur;
cur = next;
}
- /* Current consumed entirely */
- if (next && next->min_seq == blk->end)
- bt_free_sample (bt, cur);
+ if (cur && seq_lt (cur->min_seq, blk->end))
+ {
+ tcp_bt_sample_to_rate_sample (tc, cur, rs);
+ prev = bt_prev_sample (bt, cur);
+ /* Extend previous to include the newly sacked bytes */
+ if (prev && (prev->flags & TCP_BTS_IS_SACKED))
+ {
+ prev->max_seq = blk->end;
+ bt_update_sample (bt, cur, blk->end);
+ }
+ /* Split sample into two. First part is consumed */
+ else
+ {
+ next = bt_split_sample (bt, cur, blk->end);
+ cur = bt_prev_sample (bt, next);
+ cur->flags |= TCP_BTS_IS_SACKED;
+ }
+ }
}
}
@@ -533,10 +599,9 @@
if (tc->sack_sb.last_sacked_bytes)
tcp_bt_walk_samples_ooo (tc, rs);
- rs->interval_time = clib_max (tc->delivered_time - rs->prior_time,
+ rs->interval_time = clib_max ((tc->delivered_time - rs->prior_time),
rs->interval_time);
rs->delivered = tc->delivered - rs->prior_delivered;
- rs->rtt_time = tc->delivered_time - rs->rtt_time;
rs->acked_and_sacked = delivered;
rs->lost = tc->sack_sb.last_lost_bytes;
}
@@ -590,6 +655,36 @@
tc->bt = bt;
}
+u8 *
+format_tcp_bt_sample (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ tcp_bt_sample_t *bts = va_arg (*args, tcp_bt_sample_t *);
+ f64 now = tcp_time_now_us (tc->c_thread_index);
+ s = format (s, "[%u, %u] d %u dt %.3f txt %.3f ftxt %.3f flags 0x%x",
+ bts->min_seq - tc->iss, bts->max_seq - tc->iss, bts->delivered,
+ now - bts->delivered_time, now - bts->tx_time,
+ now - bts->first_tx_time, bts->flags);
+ return s;
+}
+
+u8 *
+format_tcp_bt (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ tcp_byte_tracker_t *bt = tc->bt;
+ tcp_bt_sample_t *bts;
+
+ bts = bt_get_sample (bt, bt->head);
+ while (bts)
+ {
+ s = format (s, "%U\n", format_tcp_bt_sample, tc, bts);
+ bts = bt_next_sample (bt, bts);
+ }
+
+ return s;
+}
+
/*
* fd.io coding-style-patch-verification: ON
*
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 0047f3c..4528216 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -406,7 +406,7 @@
tcp_update_rcv_wnd (tc);
if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
- tc->flags |= TCP_CONN_TRACK_BURST;
+ tcp_bt_check_app_limited (tc);
if (tc->snd_una == tc->snd_nxt)
{
@@ -1124,17 +1124,22 @@
TCP_EVT (TCP_EVT_PKTIZE, tc);
}
+always_inline u32
+tcp_buffer_len (vlib_buffer_t * b)
+{
+ u32 data_len = b->current_length;
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ data_len += b->total_length_not_including_first_buffer;
+ return data_len;
+}
+
u32
tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
{
tcp_connection_t *tc = (tcp_connection_t *) tconn;
- if (tc->flags & TCP_CONN_TRACK_BURST)
- {
- tcp_bt_check_app_limited (tc);
- tcp_bt_track_tx (tc);
- tc->flags &= ~TCP_CONN_TRACK_BURST;
- }
+ if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+ tcp_bt_track_tx (tc, tcp_buffer_len (b));
tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
/* update_snd_nxt */ 1);
@@ -1736,7 +1741,7 @@
if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
{
tcp_bt_check_app_limited (tc);
- tcp_bt_track_tx (tc);
+ tcp_bt_track_tx (tc, n_bytes);
}
tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0,
@@ -1783,6 +1788,9 @@
available_wnd = tc->snd_wnd - offset;
burst_size = clib_min (burst_size, available_wnd / tc->snd_mss);
+ if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+ tcp_bt_check_app_limited (tc);
+
while (n_segs < burst_size)
{
n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b);
@@ -1794,6 +1802,9 @@
offset += n_written;
n_segs += 1;
+ if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
+ tcp_bt_track_tx (tc, n_written);
+
tc->snd_nxt += n_written;
tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
}