tcp: delivery rate estimator
Type: feature
First cut implementation with limited testing. The feature is not
enabled by default and the expectation is that cc algorithms will enable
it on demand.
Change-Id: I92b70cb4dabcff0e9ccd1d725952c4880af394da
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index f72b957..03110e5 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -270,6 +270,9 @@
vec_free (tc->snd_sacks);
vec_free (tc->snd_sacks_fl);
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tcp_bt_cleanup (tc);
+
/* Poison the entry */
if (CLIB_DEBUG > 0)
clib_memset (tc, 0xFA, sizeof (*tc));
@@ -662,6 +665,9 @@
if (transport_connection_is_tx_paced (&tc->connection)
|| tcp_main.tx_pacing)
tcp_enable_pacing (tc);
+
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tcp_bt_init (tc);
}
static int
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index bc1e3c0..b0c3ecc 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -128,6 +128,8 @@
_(DEQ_PENDING, "Pending dequeue acked") \
_(PSH_PENDING, "PSH pending") \
_(FINRCVD, "FIN received") \
+ _(RATE_SAMPLE, "Conn does rate sampling") \
+ _(TRACK_BURST, "Track burst") \
typedef enum _tcp_connection_flag_bits
{
@@ -174,7 +176,7 @@
u32 tail; /**< Index of last entry */
u32 sacked_bytes; /**< Number of bytes sacked in sb */
u32 last_sacked_bytes; /**< Number of bytes last sacked */
- u32 last_bytes_delivered; /**< Number of sack bytes delivered */
+ u32 last_bytes_delivered; /**< Sack bytes delivered to app */
u32 snd_una_adv; /**< Bytes to add to snd_una */
u32 high_sacked; /**< Highest byte sacked (fack) */
u32 high_rxt; /**< Highest retransmitted sequence */
@@ -231,6 +233,44 @@
void scoreboard_init (sack_scoreboard_t * sb);
u8 *format_tcp_scoreboard (u8 * s, va_list * args);
+#define TCP_BTS_INVALID_INDEX ((u32)~0)
+
+typedef enum tcp_bts_flags_
+{
+ TCP_BTS_IS_RXT = 1,
+ TCP_BTS_IS_APP_LIMITED = 1 << 1,
+} __clib_packed tcp_bts_flags_t;
+
+typedef struct tcp_bt_sample_
+{
+ u32 next; /**< Next sample index in list */
+ u32 prev; /**< Previous sample index in list */
+ u32 min_seq; /**< Min seq number in sample */
+ u32 max_seq; /**< Max seq number. Set for rxt samples */
+ u64 delivered; /**< Total delivered when sample taken */
+ f64 delivered_time; /**< Delivered time when sample taken */
+ u64 tx_rate; /**< Tx pacing rate */
+ tcp_bts_flags_t flags; /**< Sample flag */
+} tcp_bt_sample_t;
+
+typedef struct tcp_rate_sample_
+{
+ u64 sample_delivered; /**< Delivered of sample used for rate */
+ u32 delivered; /**< Bytes delivered in ack time */
+ f64 ack_time; /**< Time to ack the bytes delivered */
+ u64 tx_rate; /**< Tx pacing rate */
+ tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */
+} tcp_rate_sample_t;
+
+typedef struct tcp_byte_tracker_
+{
+ tcp_bt_sample_t *samples; /**< Pool of samples */
+ rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */
+ u32 head; /**< Head of samples linked list */
+ u32 tail; /**< Tail of samples linked list */
+ u32 last_ooo; /**< Cached last ooo sample */
+} tcp_byte_tracker_t;
+
typedef enum _tcp_cc_algorithm_type
{
TCP_CC_NEWRENO,
@@ -304,6 +344,7 @@
u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */
u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */
u32 snd_congestion; /**< snd_una_max when congestion is detected */
+ u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */
tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */
u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */
@@ -316,15 +357,20 @@
f64 rtt_ts; /**< Timestamp for tracked ACK */
f64 mrtt_us; /**< High precision mrtt from tracked acks */
- u16 mss; /**< Our max seg size that includes options */
- u32 limited_transmit; /**< snd_nxt when limited transmit starts */
- u32 last_fib_check; /**< Last time we checked fib route for peer */
- u32 sw_if_index; /**< Interface for the connection */
- u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */
-
u32 psh_seq; /**< Add psh header for seg that includes this */
u32 next_node_index; /**< Can be used to control next node in output */
u32 next_node_opaque; /**< Opaque to pass to next node */
+ u32 limited_transmit; /**< snd_nxt when limited transmit starts */
+ u32 sw_if_index; /**< Interface for the connection */
+
+ /* Delivery rate estimation */
+ u64 delivered; /**< Total bytes delivered to peer */
+ u64 app_limited; /**< Delivered when app-limited detected */
+ f64 delivered_time; /**< Time last bytes were acked */
+ tcp_byte_tracker_t *bt; /**< Tx byte tracker */
+
+ u32 last_fib_check; /**< Last time we checked fib route for peer */
+ u16 mss; /**< Our max seg size that includes options */
} tcp_connection_t;
/* *INDENT-OFF* */
@@ -332,8 +378,9 @@
{
const char *name;
uword (*unformat_cfg) (unformat_input_t * input);
- void (*rcv_ack) (tcp_connection_t * tc);
- void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack);
+ void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs);
+ void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack,
+ tcp_rate_sample_t *rs);
void (*congestion) (tcp_connection_t * tc);
void (*recovered) (tcp_connection_t * tc);
void (*init) (tcp_connection_t * tc);
@@ -636,6 +683,66 @@
void tcp_program_dupack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc);
void tcp_send_acks (tcp_worker_ctx_t * wrk);
+/*
+ * Rate estimation
+ */
+
+/**
+ * Byte tracker initialize
+ *
+ * @param tc connection for which the byte tracker should be allocated and
+ * initialized
+ */
+void tcp_bt_init (tcp_connection_t * tc);
+/**
+ * Byte tracker cleanup
+ *
+ * @param tc connection for which the byte tracker should be cleaned up
+ */
+void tcp_bt_cleanup (tcp_connection_t * tc);
+/**
+ * Flush byte tracker samples
+ *
+ * @param tc tcp connection for which samples should be flushed
+ */
+void tcp_bt_flush_samples (tcp_connection_t * tc);
+/**
+ * Track a tcp tx burst
+ *
+ * @param tc tcp connection
+ */
+void tcp_bt_track_tx (tcp_connection_t * tc);
+/**
+ * Track a tcp retransmission
+ *
+ * @param tc tcp connection
+ * @param start start sequence number
+ * @param end end sequence number
+ */
+void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end);
+/**
+ * Generate a delivery rate sample from recently acked bytes
+ *
+ * @param tc tcp connection
+ * @param rs resulting rate sample
+ */
+void tcp_bt_sample_delivery_rate (tcp_connection_t * tc,
+ tcp_rate_sample_t * rs);
+/**
+ * Check if sample to be generated is app limited
+ *
+ * @param tc tcp connection
+ */
+void tcp_bt_check_app_limited (tcp_connection_t * tc);
+/**
+ * Check if the byte tracker is in sane state
+ *
+ * Should be used only for testing
+ *
+ * @param bt byte tracker
+ */
+int tcp_bt_is_sane (tcp_byte_tracker_t * bt);
+
always_inline u32
tcp_end_seq (tcp_header_t * th, u32 len)
{
@@ -825,12 +932,19 @@
u32 start_bucket);
always_inline void
-tcp_cc_rcv_ack (tcp_connection_t * tc)
+tcp_cc_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
{
- tc->cc_algo->rcv_ack (tc);
+ tc->cc_algo->rcv_ack (tc, rs);
tc->tsecr_last_ack = tc->rcv_opts.tsecr;
}
+static inline void
+tcp_cc_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
+ tcp_rate_sample_t * rs)
+{
+ tc->cc_algo->rcv_cong_ack (tc, ack_type, rs);
+}
+
always_inline void
tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
{
@@ -959,7 +1073,8 @@
return (void *) tc->cc_data;
}
-void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type);
+void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
+ tcp_rate_sample_t * rs);
/**
* Push TCP header to buffer
diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c
new file mode 100644
index 0000000..7494747
--- /dev/null
+++ b/src/vnet/tcp/tcp_bt.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2019 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * TCP byte tracker that can generate delivery rate estimates. Based on
+ * draft-cheng-iccrg-delivery-rate-estimation-00
+ */
+
+#include <vnet/tcp/tcp.h>
+
+static tcp_bt_sample_t *
+bt_get_sample (tcp_byte_tracker_t * bt, u32 bts_index)
+{
+ if (pool_is_free_index (bt->samples, bts_index))
+ return 0;
+ return pool_elt_at_index (bt->samples, bts_index);
+}
+
+static tcp_bt_sample_t *
+bt_next_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
+{
+ return bt_get_sample (bt, bts->next);
+}
+
+static tcp_bt_sample_t *
+bt_prev_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
+{
+ return bt_get_sample (bt, bts->prev);
+}
+
+static u32
+bt_sample_index (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
+{
+ if (!bts)
+ return TCP_BTS_INVALID_INDEX;
+ return bts - bt->samples;
+}
+
+static inline int
+bt_seq_lt (u32 a, u32 b)
+{
+ return seq_lt (a, b);
+}
+
+static tcp_bt_sample_t *
+bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq)
+{
+ tcp_bt_sample_t *bts;
+
+ pool_get_zero (bt->samples, bts);
+ bts->next = bts->prev = TCP_BTS_INVALID_INDEX;
+ bts->min_seq = min_seq;
+ rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples,
+ bt_seq_lt);
+ return bts;
+}
+
+static void
+bt_free_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
+{
+ if (bts->prev != TCP_BTS_INVALID_INDEX)
+ {
+ tcp_bt_sample_t *prev = bt_prev_sample (bt, bts);
+ prev->next = bts->next;
+ }
+ else
+ bt->head = bts->next;
+
+ if (bts->next != TCP_BTS_INVALID_INDEX)
+ {
+ tcp_bt_sample_t *next = bt_next_sample (bt, bts);
+ next->prev = bts->prev;
+ }
+ else
+ bt->tail = bts->prev;
+
+ rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt);
+ if (CLIB_DEBUG)
+ memset (bts, 0xfc, sizeof (*bts));
+ pool_put (bt->samples, bts);
+}
+
+static tcp_bt_sample_t *
+bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq)
+{
+ rb_tree_t *rt = &bt->sample_lookup;
+ rb_node_t *cur, *prev;
+ tcp_bt_sample_t *bts;
+
+ cur = rb_node (rt, rt->root);
+ if (rb_node_is_tnil (rt, cur))
+ return 0;
+
+ while (seq != cur->key)
+ {
+ prev = cur;
+ if (seq_lt (seq, cur->key))
+ cur = rb_node_left (rt, cur);
+ else
+ cur = rb_node_right (rt, cur);
+
+ if (rb_node_is_tnil (rt, cur))
+ {
+ /* Hit tnil as a left child. Find predecessor */
+ if (seq_lt (seq, prev->key))
+ {
+ cur = rb_tree_predecessor (rt, prev);
+ if (rb_node_is_tnil (rt, cur))
+ return 0;
+ bts = bt_get_sample (bt, cur->opaque);
+ }
+ /* Hit tnil as a right child */
+ else
+ {
+ bts = bt_get_sample (bt, prev->opaque);
+ }
+
+ if (seq_geq (seq, bts->min_seq))
+ return bts;
+
+ return 0;
+ }
+ }
+
+ if (!rb_node_is_tnil (rt, cur))
+ return bt_get_sample (bt, cur->opaque);
+
+ return 0;
+}
+
+static void
+bt_update_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
+{
+ rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt);
+ bts->min_seq = seq;
+ rb_tree_add_custom (&bt->sample_lookup, bts->min_seq,
+ bt_sample_index (bt, bts), bt_seq_lt);
+}
+
+static tcp_bt_sample_t *
+bt_fix_overlapped (tcp_byte_tracker_t * bt, tcp_bt_sample_t * start,
+ u32 seq, u8 is_end)
+{
+ tcp_bt_sample_t *cur, *next;
+
+ cur = start;
+ while ((next = bt_next_sample (bt, cur)) && seq_lt (next->min_seq, seq))
+ {
+ bt_free_sample (bt, cur);
+ cur = next;
+ }
+
+ if (next)
+ {
+ bt_free_sample (bt, cur);
+ return next;
+ }
+
+ /* Overlapping current entirely */
+ if (is_end)
+ {
+ bt_free_sample (bt, cur);
+ return 0;
+ }
+
+ /* Overlapping head of current but not all */
+ bt_update_sample (bt, cur, seq);
+ return cur;
+}
+
+int
+tcp_bt_is_sane (tcp_byte_tracker_t * bt)
+{
+ tcp_bt_sample_t *bts, *tmp;
+
+ if (pool_elts (bt->samples) != pool_elts (bt->sample_lookup.nodes) - 1)
+ return 0;
+
+ if (bt->head == TCP_BTS_INVALID_INDEX)
+ {
+ if (bt->tail != TCP_BTS_INVALID_INDEX)
+ return 0;
+ if (pool_elts (bt->samples) != 0)
+ return 0;
+ return 1;
+ }
+
+ bts = bt_get_sample (bt, bt->tail);
+ if (!bts)
+ return 0;
+
+ bts = bt_get_sample (bt, bt->head);
+ if (!bts || bts->prev != TCP_BTS_INVALID_INDEX)
+ return 0;
+
+ while (bts)
+ {
+ tmp = bt_lookup_seq (bt, bts->min_seq);
+ if (!tmp)
+ return 0;
+ if (tmp != bts)
+ return 0;
+ tmp = bt_next_sample (bt, bts);
+ if (tmp)
+ {
+ if (tmp->prev != bt_sample_index (bt, bts))
+ {
+ clib_warning ("next %u thinks prev is %u should be %u",
+ bts->next, tmp->prev, bt_sample_index (bt, bts));
+ return 0;
+ }
+ if (!seq_lt (bts->min_seq, tmp->min_seq))
+ return 0;
+ }
+ else
+ {
+ if (bt->tail != bt_sample_index (bt, bts))
+ return 0;
+ if (bts->next != TCP_BTS_INVALID_INDEX)
+ return 0;
+ }
+ bts = tmp;
+ }
+ return 1;
+}
+
+static tcp_bt_sample_t *
+tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq)
+{
+ tcp_bt_sample_t *bts;
+ bts = bt_alloc_sample (tc->bt, min_seq);
+ bts->delivered = tc->delivered;
+ bts->delivered_time = tc->delivered_time;
+ bts->tx_rate = transport_connection_tx_pacer_rate (&tc->connection);
+ bts->flags |= tc->app_limited ? TCP_BTS_IS_APP_LIMITED : 0;
+ return bts;
+}
+
+void
+tcp_bt_check_app_limited (tcp_connection_t * tc)
+{
+ u32 available_bytes, flight_size;
+
+ available_bytes = transport_max_tx_dequeue (&tc->connection);
+ flight_size = tcp_flight_size (tc);
+
+ /* Not enough bytes to fill the cwnd */
+ if (available_bytes + flight_size + tc->snd_mss < tc->cwnd
+ /* Bytes considered lost have been retransmitted */
+ && tc->sack_sb.lost_bytes <= tc->snd_rxt_bytes)
+ tc->app_limited = tc->delivered + flight_size ? : 1;
+}
+
+void
+tcp_bt_track_tx (tcp_connection_t * tc)
+{
+ tcp_byte_tracker_t *bt = tc->bt;
+ tcp_bt_sample_t *bts, *tail;
+ u32 bts_index;
+
+ if (!tcp_flight_size (tc))
+ tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
+
+ bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt);
+ bts_index = bt_sample_index (bt, bts);
+ tail = bt_get_sample (bt, bt->tail);
+ if (tail)
+ {
+ tail->next = bts_index;
+ bts->prev = bt->tail;
+ bt->tail = bts_index;
+ }
+ else
+ {
+ bt->tail = bt->head = bts_index;
+ }
+}
+
+void
+tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
+{
+ tcp_byte_tracker_t *bt = tc->bt;
+ tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts;
+ u32 bts_index, cur_index, next_index, prev_index, min_seq;
+ u8 is_end = end == tc->snd_nxt;
+
+ bts = bt_get_sample (bt, bt->last_ooo);
+ if (bts && bts->max_seq == start)
+ {
+ bts->max_seq = end;
+ next = bt_next_sample (bt, bts);
+ if (next)
+ bt_fix_overlapped (bt, next, end, is_end);
+
+ return;
+ }
+
+ /* Find original tx sample */
+ bts = bt_lookup_seq (bt, start);
+
+ ASSERT (bts != 0 && seq_geq (start, bts->min_seq));
+
+ /* Head in the past */
+ if (seq_lt (bts->min_seq, tc->snd_una))
+ bt_update_sample (bt, bts, tc->snd_una);
+
+ /* Head overlap */
+ if (bts->min_seq == start)
+ {
+ prev_index = bts->prev;
+ next = bt_fix_overlapped (bt, bts, end, is_end);
+ next_index = bt_sample_index (bt, next);
+
+ cur = tcp_bt_alloc_tx_sample (tc, start);
+ cur->max_seq = end;
+ cur->flags |= TCP_BTS_IS_RXT;
+ cur->next = next_index;
+ cur->prev = prev_index;
+
+ cur_index = bt_sample_index (bt, cur);
+
+ if (next_index != TCP_BTS_INVALID_INDEX)
+ {
+ next = bt_get_sample (bt, next_index);
+ next->prev = cur_index;
+ }
+ else
+ {
+ bt->tail = cur_index;
+ }
+
+ if (prev_index != TCP_BTS_INVALID_INDEX)
+ {
+ prev = bt_get_sample (bt, prev_index);
+ prev->next = cur_index;
+ }
+ else
+ {
+ bt->head = cur_index;
+ }
+
+ bt->last_ooo = cur_index;
+ return;
+ }
+
+ bts_index = bt_sample_index (bt, bts);
+ next = bt_next_sample (bt, bts);
+ if (next)
+ next = bt_fix_overlapped (bt, next, end, is_end);
+
+ min_seq = next ? next->min_seq : tc->snd_nxt;
+ ASSERT (seq_lt (start, min_seq));
+
+ /* Have to split or tail overlap */
+ cur = tcp_bt_alloc_tx_sample (tc, start);
+ cur->max_seq = end;
+ cur->flags |= TCP_BTS_IS_RXT;
+ cur->prev = bts_index;
+ cur_index = bt_sample_index (bt, cur);
+
+ /* Split. Allocate another sample */
+ if (seq_lt (end, min_seq))
+ {
+ nbts = tcp_bt_alloc_tx_sample (tc, end);
+ cur = bt_get_sample (bt, cur_index);
+ bts = bt_get_sample (bt, bts_index);
+
+ *nbts = *bts;
+ nbts->min_seq = end;
+
+ if (nbts->next != TCP_BTS_INVALID_INDEX)
+ {
+ next = bt_get_sample (bt, nbts->next);
+ next->prev = bt_sample_index (bt, nbts);
+ }
+ else
+ bt->tail = bt_sample_index (bt, nbts);
+
+ bts->next = nbts->prev = cur_index;
+ cur->next = bt_sample_index (bt, nbts);
+
+ bt->last_ooo = cur_index;
+ }
+ /* Tail completely overlapped */
+ else
+ {
+ bts = bt_get_sample (bt, bts_index);
+
+ if (bts->next != TCP_BTS_INVALID_INDEX)
+ {
+ next = bt_get_sample (bt, bts->next);
+ next->prev = cur_index;
+ }
+ else
+ bt->tail = cur_index;
+
+ cur->next = bts->next;
+ bts->next = cur_index;
+
+ bt->last_ooo = cur_index;
+ }
+}
+
+static void
+tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts,
+ tcp_rate_sample_t * rs)
+{
+ if (rs->sample_delivered && rs->sample_delivered >= bts->delivered)
+ return;
+
+ rs->sample_delivered = bts->delivered;
+ rs->delivered = tc->delivered - bts->delivered;
+ rs->ack_time = tc->delivered_time - bts->delivered_time;
+ rs->tx_rate = bts->tx_rate;
+ rs->flags = bts->flags;
+}
+
+static void
+tcp_bt_walk_samples (tcp_connection_t * tc, tcp_rate_sample_t * rs)
+{
+ tcp_byte_tracker_t *bt = tc->bt;
+ tcp_bt_sample_t *next, *cur;
+
+ cur = bt_get_sample (bt, bt->head);
+ tcp_bt_sample_to_rate_sample (tc, cur, rs);
+ while ((next = bt_get_sample (bt, cur->next))
+ && seq_lt (next->min_seq, tc->snd_una))
+ {
+ bt_free_sample (bt, cur);
+ tcp_bt_sample_to_rate_sample (tc, next, rs);
+ cur = next;
+ }
+
+ ASSERT (seq_lt (cur->min_seq, tc->snd_una));
+
+ /* All samples acked */
+ if (tc->snd_una == tc->snd_nxt)
+ {
+ ASSERT (pool_elts (bt->samples) == 1);
+ bt_free_sample (bt, cur);
+ return;
+ }
+
+ /* Current sample completely consumed */
+ if (next && next->min_seq == tc->snd_una)
+ {
+ bt_free_sample (bt, cur);
+ cur = next;
+ }
+}
+
+static void
+tcp_bt_walk_samples_ooo (tcp_connection_t * tc, tcp_rate_sample_t * rs)
+{
+ sack_block_t *blks = tc->rcv_opts.sacks, *blk;
+ tcp_byte_tracker_t *bt = tc->bt;
+ tcp_bt_sample_t *next, *cur;
+ int i;
+
+ for (i = 0; i < vec_len (blks); i++)
+ {
+ blk = &blks[i];
+
+ /* Ignore blocks that are already covered by snd_una */
+ if (seq_lt (blk->end, tc->snd_una))
+ continue;
+
+ cur = bt_lookup_seq (bt, blk->start);
+ if (!cur)
+ continue;
+
+ tcp_bt_sample_to_rate_sample (tc, cur, rs);
+
+ /* Current shouldn't be removed */
+ if (cur->min_seq != blk->start)
+ {
+ cur = bt_next_sample (bt, cur);
+ if (!cur)
+ continue;
+ }
+
+ while ((next = bt_get_sample (bt, cur->next))
+ && seq_lt (next->min_seq, blk->end))
+ {
+ bt_free_sample (bt, cur);
+ tcp_bt_sample_to_rate_sample (tc, next, rs);
+ cur = next;
+ }
+
+ /* Current consumed entirely */
+ if (next && next->min_seq == blk->end)
+ bt_free_sample (bt, cur);
+ }
+}
+
+void
+tcp_bt_sample_delivery_rate (tcp_connection_t * tc, tcp_rate_sample_t * rs)
+{
+ u32 delivered;
+
+ if (PREDICT_FALSE (tc->flags & TCP_CONN_FINSNT))
+ return;
+
+ delivered = tc->bytes_acked + tc->sack_sb.last_sacked_bytes;
+ if (!delivered || tc->bt->head == TCP_BTS_INVALID_INDEX)
+ return;
+
+ /* Do not count bytes that were previously sacked again */
+ tc->delivered += delivered - tc->sack_sb.last_bytes_delivered;
+ tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
+
+ if (tc->app_limited && tc->delivered > tc->app_limited)
+ tc->app_limited = 0;
+
+ if (tc->bytes_acked)
+ tcp_bt_walk_samples (tc, rs);
+
+ if (tc->sack_sb.last_sacked_bytes)
+ tcp_bt_walk_samples_ooo (tc, rs);
+}
+
+void
+tcp_bt_flush_samples (tcp_connection_t * tc)
+{
+ tcp_byte_tracker_t *bt = tc->bt;
+ tcp_bt_sample_t *bts;
+ u32 *samples = 0, *si;
+
+ vec_validate (samples, pool_elts (bt->samples) - 1);
+
+ /* *INDENT-OFF* */
+ pool_foreach (bts, bt->samples, ({
+ vec_add1 (samples, bts - bt->samples);
+ }));
+ /* *INDENT-ON* */
+
+ vec_foreach (si, samples)
+ {
+ bts = bt_get_sample (bt, *si);
+ bt_free_sample (bt, bts);
+ }
+
+ vec_free (samples);
+}
+
+void
+tcp_bt_cleanup (tcp_connection_t * tc)
+{
+ tcp_byte_tracker_t *bt = tc->bt;
+
+ rb_tree_free_nodes (&bt->sample_lookup);
+ pool_free (bt->samples);
+ clib_mem_free (bt);
+ tc->bt = 0;
+}
+
+void
+tcp_bt_init (tcp_connection_t * tc)
+{
+ tcp_byte_tracker_t *bt;
+
+ bt = clib_mem_alloc (sizeof (tcp_byte_tracker_t));
+ clib_memset (bt, 0, sizeof (tcp_byte_tracker_t));
+
+ rb_tree_init (&bt->sample_lookup);
+ bt->head = bt->tail = TCP_BTS_INVALID_INDEX;
+ tc->bt = bt;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_cubic.c b/src/vnet/tcp/tcp_cubic.c
index 3815627..80d4308 100644
--- a/src/vnet/tcp/tcp_cubic.c
+++ b/src/vnet/tcp/tcp_cubic.c
@@ -130,7 +130,7 @@
}
static void
-cubic_rcv_ack (tcp_connection_t * tc)
+cubic_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
{
cubic_data_t *cd = (cubic_data_t *) tcp_cc_data (tc);
u64 w_cubic, w_aimd;
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 6c78af0..944b8eb 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -1264,12 +1264,12 @@
}
static void
-tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b)
+tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs)
{
ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
/* Congestion avoidance */
- tcp_cc_rcv_ack (tc);
+ tcp_cc_rcv_ack (tc, rs);
/* If a cumulative ack, make sure dupacks is 0 */
tc->rcv_dupacks = 0;
@@ -1376,7 +1376,8 @@
* One function to rule them all ... and in the darkness bind them
*/
static void
-tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
+tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
+ u32 is_dack)
{
u32 rxt_delivered;
@@ -1402,7 +1403,7 @@
if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
{
ASSERT (tcp_in_fastrecovery (tc));
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
return;
}
else if (tcp_should_fastrecover (tc))
@@ -1421,7 +1422,7 @@
}
tcp_cc_init_congestion (tc);
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
@@ -1447,7 +1448,7 @@
else if (!tc->bytes_acked
|| (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
{
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
return;
}
else
@@ -1502,7 +1503,7 @@
}
/* Treat as congestion avoidance ack */
- tcp_cc_rcv_ack (tc);
+ tcp_cc_rcv_ack (tc, rs);
return;
}
@@ -1520,7 +1521,7 @@
/* Post RTO timeout don't try anything fancy */
if (tcp_in_recovery (tc))
{
- tcp_cc_rcv_ack (tc);
+ tcp_cc_rcv_ack (tc, rs);
transport_add_tx_event (&tc->connection);
return;
}
@@ -1557,7 +1558,7 @@
tc->snd_rxt_bytes = 0;
}
- tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
+ tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
/*
* Since this was a partial ack, try to retransmit some more data
@@ -1573,6 +1574,7 @@
tcp_header_t * th, u32 * error)
{
u32 prev_snd_wnd, prev_snd_una;
+ tcp_rate_sample_t rs = { 0 };
u8 is_dack;
TCP_EVT_DBG (TCP_EVT_CC_STAT, tc);
@@ -1602,7 +1604,7 @@
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
vnet_buffer (b)->tcp.ack_number);
if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
- tcp_cc_handle_event (tc, 1);
+ tcp_cc_handle_event (tc, 0, 1);
/* Don't drop yet */
return 0;
}
@@ -1630,6 +1632,9 @@
tcp_update_rtt (tc, vnet_buffer (b)->tcp.ack_number);
}
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tcp_bt_sample_delivery_rate (tc, &rs);
+
TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
/*
@@ -1638,7 +1643,7 @@
if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
{
- tcp_cc_handle_event (tc, is_dack);
+ tcp_cc_handle_event (tc, &rs, is_dack);
if (!tcp_in_cong_recovery (tc))
{
*error = TCP_ERROR_ACK_OK;
@@ -1653,7 +1658,7 @@
/*
* Update congestion control (slow start/congestion avoidance)
*/
- tcp_cc_update (tc, b);
+ tcp_cc_update (tc, &rs);
*error = TCP_ERROR_ACK_OK;
return 0;
}
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index c40e443..3887b34 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -28,7 +28,7 @@
}
void
-newreno_rcv_ack (tcp_connection_t * tc)
+newreno_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
{
if (tcp_in_slowstart (tc))
{
@@ -42,7 +42,8 @@
}
void
-newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type)
+newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
+ tcp_rate_sample_t * rs)
{
if (ack_type == TCP_CC_DUPACK)
{
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 1adac95..d3c4ca4 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -422,6 +422,9 @@
&tc->snd_opts);
tcp_update_rcv_wnd (tc);
+
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tc->flags |= TCP_CONN_TRACK_BURST;
}
void
@@ -1129,8 +1132,17 @@
tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
{
tcp_connection_t *tc = (tcp_connection_t *) tconn;
+
+ if (tc->flags & TCP_CONN_TRACK_BURST)
+ {
+ tcp_bt_check_app_limited (tc);
+ tcp_bt_track_tx (tc);
+ tc->flags &= ~TCP_CONN_TRACK_BURST;
+ }
+
tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
/* update_snd_nxt */ 1);
+
tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
/* If not tracking an ACK, start tracking */
@@ -1418,7 +1430,11 @@
return 0;
if (tcp_in_fastrecovery (tc))
- tc->snd_rxt_bytes += n_bytes;
+ {
+ tc->snd_rxt_bytes += n_bytes;
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tcp_bt_track_rxt (tc, start, start + n_bytes);
+ }
done:
TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes);
@@ -1540,6 +1556,9 @@
else
scoreboard_clear (&tc->sack_sb);
+ if (tc->flags & TCP_CONN_RATE_SAMPLE)
+ tcp_bt_flush_samples (tc);
+
/* If we've sent beyond snd_congestion, update it */
tc->snd_congestion = seq_max (tc->snd_nxt, tc->snd_congestion);