tcp: fast retransmit improvements
Patch is too large to be ported to 18.10 just days before release.
- handle fast retransmits outside of established node and limit the
retransmit burst size to avoid tx losses and worsening congestion.
- in the absence of a tx pacer, use slow start after fast retransmit
exits
- add fast retransmit heuristic that re-retries sending the first
segment if everything else fails
- fine tuning
Change-Id: I84a2ab8fbba8b97f1d2b26584dc11a1e2c33c8d2
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index e32b5c4..cb05b8c 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -950,7 +950,8 @@
hole = scoreboard_first_hole (sb);
if (hole)
- s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail);
+ s = format (s, "\n head %u tail %u %u holes:\n", sb->head, sb->tail,
+ pool_elts (sb->holes));
while (hole)
{
@@ -1027,7 +1028,7 @@
{
int snd_space, snt_limited;
- if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0))
+ if (PREDICT_TRUE (!tcp_in_fastrecovery (tc)))
{
snd_space = tcp_available_output_snd_space (tc);
@@ -1047,16 +1048,6 @@
return tcp_round_snd_space (tc, snd_space);
}
- if (tcp_in_recovery (tc))
- {
- tc->snd_nxt = tc->snd_una_max;
- snd_space = tcp_available_snd_wnd (tc) - tc->snd_rxt_bytes
- - (tc->snd_una_max - tc->snd_congestion);
- if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd)
- return 0;
- return tcp_round_snd_space (tc, snd_space);
- }
-
/* RFC 5681: When previously unsent data is available and the new value of
* cwnd and the receiver's advertised window allow, a TCP SHOULD send 1*SMSS
* bytes of previously unsent data. */
@@ -1103,6 +1094,7 @@
tw_timer_expire_timers_16t_2w_512sl (&tcp_main.
wrk_ctx[thread_index].timer_wheel,
now);
+ tcp_do_fastretransmits (thread_index);
tcp_flush_frames_to_output (thread_index);
}
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 165659b..a036072 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -120,6 +120,8 @@
_(FR_1_SMSS, "Sent 1 SMSS") \
_(HALF_OPEN_DONE, "Half-open completed") \
_(FINPNDG, "FIN pending") \
+ _(FRXT_PENDING, "Fast-retransmit pending") \
+ _(FRXT_FIRST, "Fast-retransmit first again") \
typedef enum _tcp_connection_flag_bits
{
@@ -345,6 +347,9 @@
#define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
#define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
#define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
+#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST)
#define tcp_in_cong_recovery(tc) ((tc)->flags & \
(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
@@ -354,6 +359,7 @@
{
tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
tcp_fastrecovery_1_smss_off (tc);
+ tcp_fastrecovery_first_off (tc);
}
typedef enum _tcp_error
@@ -379,9 +385,15 @@
output nodes */
vlib_frame_t *ip_lookup_tx_frames[2]; /**< tx frames for ip 4/6
lookup nodes */
+ u32 *pending_fast_rxt; /**< vector of connections
+ needing fast rxt */
+ u32 *ongoing_fast_rxt; /**< vector of connections
+ now doing fast rxt */
+
CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
u8 cached_opts[40]; /**< cached 'on the wire'
options for bursts */
+
} tcp_worker_ctx_t;
typedef struct _tcp_main
@@ -542,6 +554,8 @@
void tcp_update_rto (tcp_connection_t * tc);
void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4);
void tcp_flush_frames_to_output (u8 thread_index);
+void tcp_program_fastretransmit (tcp_connection_t * tc);
+void tcp_do_fastretransmits (u32 thread_index);
always_inline u32
tcp_end_seq (tcp_header_t * th, u32 len)
@@ -659,10 +673,10 @@
}
u32 tcp_snd_space (tcp_connection_t * tc);
-void tcp_retransmit_first_unacked (tcp_connection_t * tc);
-void tcp_fast_retransmit_no_sack (tcp_connection_t * tc);
-void tcp_fast_retransmit_sack (tcp_connection_t * tc);
-void tcp_fast_retransmit (tcp_connection_t * tc);
+int tcp_retransmit_first_unacked (tcp_connection_t * tc);
+int tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size);
+int tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size);
+int tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size);
void tcp_cc_init_congestion (tcp_connection_t * tc);
void tcp_cc_fastrecovery_exit (tcp_connection_t * tc);
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index ccf12da..8f626b1 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -629,6 +629,8 @@
#define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \
{ \
+ if (_tc->snd_una != _tc->iss) \
+ TCP_EVT_CC_STAT_PRINT (_tc); \
ELOG_TYPE_DECLARE (_e) = \
{ \
.format = "cc: %s snd_space %u snd_una %u out %u flight %u", \
@@ -788,9 +790,11 @@
#define STATS_INTERVAL 1
-#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \
-{ \
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
+#define tcp_cc_time_to_print_stats(_tc) \
+ _tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now() \
+ || tcp_in_fastrecovery (_tc) \
+
+#define TCP_EVT_CC_RTO_STAT_PRINT(_tc) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
@@ -801,29 +805,40 @@
ed->data[0] = _tc->rto; \
ed->data[1] = _tc->srtt; \
ed->data[2] = _tc->rttvar; \
+}
+
+#define TCP_EVT_CC_RTO_STAT_HANDLER(_tc, ...) \
+{ \
+if (tcp_cc_time_to_print_stats (_tc)) \
+{ \
+ TCP_EVT_CC_RTO_STAT_PRINT (_tc); \
} \
}
-#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...) \
-{ \
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
+
+#define TCP_EVT_CC_SND_STAT_PRINT(_tc) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "snd_stat: dack %u sacked %u lost %u out %u rxt %u", \
+ .format = "snd_stat: cc_space %u sacked %u lost %u out %u rxt %u", \
.format_args = "i4i4i4i4i4", \
}; \
DECLARE_ETD(_tc, _e, 5); \
- ed->data[0] = _tc->rcv_dupacks; \
+ ed->data[0] = tcp_available_cc_snd_space (_tc); \
ed->data[1] = _tc->sack_sb.sacked_bytes; \
ed->data[2] = _tc->sack_sb.lost_bytes; \
ed->data[3] = tcp_bytes_out (_tc); \
ed->data[3] = _tc->snd_rxt_bytes; \
+}
+
+#define TCP_EVT_CC_SND_STAT_HANDLER(_tc, ...) \
+{ \
+if (tcp_cc_time_to_print_stats (_tc)) \
+{ \
+ TCP_EVT_CC_SND_STAT_PRINT(_tc); \
} \
}
-#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \
-{ \
-if (_tc->c_cc_stat_tstamp + STATS_INTERVAL < tcp_time_now()) \
+#define TCP_EVT_CC_STAT_PRINT(_tc) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
@@ -836,7 +851,15 @@
ed->data[2] = tcp_snd_space (_tc); \
ed->data[3] = _tc->ssthresh; \
ed->data[4] = _tc->snd_wnd; \
- TCP_EVT_CC_RTO_STAT_HANDLER (_tc); \
+ TCP_EVT_CC_RTO_STAT_PRINT (_tc); \
+ TCP_EVT_CC_SND_STAT_PRINT (_tc); \
+}
+
+#define TCP_EVT_CC_STAT_HANDLER(_tc, ...) \
+{ \
+if (tcp_cc_time_to_print_stats (_tc)) \
+{ \
+ TCP_EVT_CC_STAT_PRINT (_tc); \
_tc->c_cc_stat_tstamp = tcp_time_now(); \
} \
}
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 4e3987e..39a538b 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -749,7 +749,7 @@
/* Rule (3): if hole not lost */
else if (seq_lt (hole->start, sb->high_sacked))
{
- *snd_limited = 1;
+ *snd_limited = 0;
sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
}
/* Rule (4): if hole beyond high_sacked */
@@ -993,10 +993,10 @@
sb->last_sacked_bytes = sb->sacked_bytes
- (old_sacked_bytes - sb->last_bytes_delivered);
ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
- ASSERT (sb->sacked_bytes == 0
+ ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
|| sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack));
ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max
- - seq_max (tc->snd_una, ack));
+ - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
|| sb->holes[sb->head].start == ack + sb->snd_una_adv);
TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc);
@@ -1052,6 +1052,9 @@
tcp_fastrecovery_on (tc);
tc->snd_congestion = tc->snd_una_max;
tc->cwnd_acc_bytes = 0;
+ tc->snd_rxt_bytes = 0;
+ tc->prev_ssthresh = tc->ssthresh;
+ tc->prev_cwnd = tc->cwnd;
tc->cc_algo->congestion (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
}
@@ -1074,8 +1077,14 @@
tc->snd_rxt_bytes = 0;
tc->rcv_dupacks = 0;
tc->snd_nxt = tc->snd_una_max;
+ tc->snd_rxt_bytes = 0;
+
+ /* HACK: since we don't have an output pacer, force slow start */
+ tc->cwnd = 20 * tc->snd_mss;
+
tcp_fastrecovery_off (tc);
tcp_fastrecovery_1_smss_off (tc);
+ tcp_fastrecovery_first_off (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
@@ -1088,13 +1097,14 @@
tc->rcv_dupacks = 0;
if (tcp_in_recovery (tc))
tcp_cc_recovery_exit (tc);
+ else if (tcp_in_fastrecovery (tc))
+ tcp_cc_fastrecovery_exit (tc);
ASSERT (tc->rto_boff == 0);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5);
- /* TODO extend for fastrecovery */
}
-static u8
-tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+static inline u8
+tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
{
return (tcp_in_recovery (tc) && tc->rto_boff == 1
&& tc->snd_rxt_ts
@@ -1102,6 +1112,20 @@
&& timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
}
+static inline u8
+tcp_cc_is_spurious_fast_rxt (tcp_connection_t * tc)
+{
+ return (tcp_in_fastrecovery (tc)
+ && tc->cwnd > tc->ssthresh + 3 * tc->snd_mss);
+}
+
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+ return (tcp_cc_is_spurious_timeout_rxt (tc)
+ || tcp_cc_is_spurious_fast_rxt (tc));
+}
+
static int
tcp_cc_recover (tcp_connection_t * tc)
{
@@ -1158,6 +1182,84 @@
|| tcp_should_fastrecover_sack (tc));
}
+void
+tcp_program_fastretransmit (tcp_connection_t * tc)
+{
+ tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[tc->c_thread_index];
+ if (!(tc->flags & TCP_CONN_FRXT_PENDING))
+ {
+ vec_add1 (wrk->pending_fast_rxt, tc->c_c_index);
+ tc->flags |= TCP_CONN_FRXT_PENDING;
+ }
+}
+
+void
+tcp_do_fastretransmits (u32 thread_index)
+{
+ tcp_worker_ctx_t *wrk = &tcp_main.wrk_ctx[thread_index];
+ u32 max_burst_size, burst_size, n_segs = 0;
+ tcp_connection_t *tc;
+ int i;
+
+ if (vec_len (wrk->pending_fast_rxt) == 0)
+ return;
+
+ vec_append (wrk->ongoing_fast_rxt, wrk->pending_fast_rxt);
+ vec_reset_length (wrk->pending_fast_rxt);
+
+ max_burst_size = VLIB_FRAME_SIZE / vec_len (wrk->ongoing_fast_rxt);
+ max_burst_size = clib_max (max_burst_size, 1);
+
+ for (i = 0; i < vec_len (wrk->ongoing_fast_rxt); i++)
+ {
+ tc = tcp_connection_get (wrk->ongoing_fast_rxt[i], thread_index);
+ tc->flags &= ~TCP_CONN_FRXT_PENDING;
+
+ if (!tcp_in_fastrecovery (tc))
+ continue;
+
+ /* TODO tx pacer instead of this */
+ if (n_segs >= VLIB_FRAME_SIZE)
+ {
+ tcp_program_fastretransmit (tc);
+ continue;
+ }
+
+ burst_size = clib_min (max_burst_size, VLIB_FRAME_SIZE - n_segs);
+
+ if (tc->cwnd > tc->ssthresh + 3 * tc->snd_mss)
+ {
+ /* The first segment MUST be retransmitted */
+ if (tcp_retransmit_first_unacked (tc))
+ {
+ tcp_program_fastretransmit (tc);
+ continue;
+ }
+
+ /* Post retransmit update cwnd to ssthresh and account for the
+ * three segments that have left the network and should've been
+ * buffered at the receiver XXX */
+ tc->cwnd = tc->ssthresh + 3 * tc->snd_mss;
+
+ /* If cwnd allows, send more data */
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
+ {
+ scoreboard_init_high_rxt (&tc->sack_sb,
+ tc->snd_una + tc->snd_mss);
+ tc->sack_sb.rescue_rxt = tc->snd_una - 1;
+ n_segs += tcp_fast_retransmit_sack (tc, burst_size);
+ }
+ else
+ {
+ n_segs += tcp_fast_retransmit_no_sack (tc, burst_size);
+ }
+ }
+ else
+ n_segs += tcp_fast_retransmit (tc, burst_size);
+ }
+ vec_reset_length (wrk->ongoing_fast_rxt);
+}
+
/**
* One function to rule them all ... and in the darkness bind them
*/
@@ -1170,7 +1272,7 @@
{
if (tc->bytes_acked)
goto partial_ack;
- tcp_fast_retransmit (tc);
+ tcp_program_fastretransmit (tc);
return;
}
/*
@@ -1196,20 +1298,10 @@
{
ASSERT (!tcp_in_fastrecovery (tc));
- /* If of of the two conditions lower hold, reset dupacks because
- * we're probably after timeout (RFC6582 heuristics).
- * If Cumulative ack does not cover more than congestion threshold,
- * and:
- * 1) The following doesn't hold: The congestion window is greater
- * than SMSS bytes and the difference between highest_ack
- * and prev_highest_ack is at most 4*SMSS bytes
- * 2) Echoed timestamp in the last non-dup ack does not equal the
- * stored timestamp
- */
- if (seq_leq (tc->snd_una, tc->snd_congestion)
- && ((!(tc->cwnd > tc->snd_mss
- && tc->bytes_acked <= 4 * tc->snd_mss))
- || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ /* Heuristic to catch potential late dupacks
+ * after fast retransmit exits */
+ if (is_dack && tc->snd_una == tc->snd_congestion
+ && timestamp_leq (tc->rcv_opts.tsecr, tc->tsecr_last_ack))
{
tc->rcv_dupacks = 0;
return;
@@ -1218,26 +1310,10 @@
tcp_cc_init_congestion (tc);
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
- /* The first segment MUST be retransmitted */
- tcp_retransmit_first_unacked (tc);
-
- /* Post retransmit update cwnd to ssthresh and account for the
- * three segments that have left the network and should've been
- * buffered at the receiver XXX */
- tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
- ASSERT (tc->cwnd >= tc->snd_mss);
-
- /* If cwnd allows, send more data */
if (tcp_opts_sack_permitted (&tc->rcv_opts))
- {
- scoreboard_init_high_rxt (&tc->sack_sb,
- tc->snd_una + tc->snd_mss);
- tcp_fast_retransmit_sack (tc);
- }
- else
- {
- tcp_fast_retransmit_no_sack (tc);
- }
+ tc->sack_sb.high_rxt = tc->snd_una;
+
+ tcp_program_fastretransmit (tc);
return;
}
else if (!tc->bytes_acked
@@ -1249,6 +1325,28 @@
else
goto partial_ack;
}
+ /* Don't allow entry in fast recovery if still in recovery, for now */
+ else if (0 && is_dack && tcp_in_recovery (tc))
+ {
+ /* If one of the two conditions below holds, reset dupacks because
+ * we're probably after timeout (RFC6582 heuristics).
+ * If Cumulative ack does not cover more than congestion threshold,
+ * and:
+ * 1) The following doesn't hold: The congestion window is greater
+ * than SMSS bytes and the difference between highest_ack
+ * and prev_highest_ack is at most 4*SMSS bytes
+ * 2) Echoed timestamp in the last non-dup ack does not equal the
+ * stored timestamp
+ */
+ if (seq_leq (tc->snd_una, tc->snd_congestion)
+ && ((!(tc->cwnd > tc->snd_mss
+ && tc->bytes_acked <= 4 * tc->snd_mss))
+ || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
+ {
+ tc->rcv_dupacks = 0;
+ return;
+ }
+ }
if (!tc->bytes_acked)
return;
@@ -1259,14 +1357,11 @@
/*
* Legitimate ACK. 1) See if we can exit recovery
*/
- /* XXX limit this only to first partial ack? */
- if (seq_lt (tc->snd_una, tc->snd_congestion))
- tcp_retransmit_timer_force_update (tc);
- else
- tcp_retransmit_timer_update (tc);
if (seq_geq (tc->snd_una, tc->snd_congestion))
{
+ tcp_retransmit_timer_update (tc);
+
/* If spurious return, we've already updated everything */
if (tcp_cc_recover (tc))
{
@@ -1286,6 +1381,9 @@
* Legitimate ACK. 2) If PARTIAL ACK try to retransmit
*/
+ /* XXX limit this only to first partial ack? */
+ tcp_retransmit_timer_force_update (tc);
+
/* RFC6675: If the incoming ACK is a cumulative acknowledgment,
* reset dupacks to 0. Also needed if in congestion recovery */
tc->rcv_dupacks = 0;
@@ -1300,24 +1398,33 @@
}
/* Remove retransmitted bytes that have been delivered */
- ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
- >= tc->sack_sb.last_bytes_delivered
- || (tc->flags & TCP_CONN_FINSNT));
-
- if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ if (tcp_opts_sack_permitted (&tc->rcv_opts))
{
+ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv
+ >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
+
/* If we have sacks and we haven't gotten an ack beyond high_rxt,
* remove sacked bytes delivered */
- rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
- - tc->sack_sb.last_bytes_delivered;
- ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
- tc->snd_rxt_bytes -= rxt_delivered;
+ if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt))
+ {
+ rxt_delivered = tc->bytes_acked + tc->sack_sb.snd_una_adv
+ - tc->sack_sb.last_bytes_delivered;
+ ASSERT (tc->snd_rxt_bytes >= rxt_delivered);
+ tc->snd_rxt_bytes -= rxt_delivered;
+ }
+ else
+ {
+ /* Apparently all retransmitted holes have been acked */
+ tc->snd_rxt_bytes = 0;
+ }
}
else
{
- /* Either all retransmitted holes have been acked, or we're
- * "in the blind" and retransmitting segment by segment */
- tc->snd_rxt_bytes = 0;
+ if (tc->snd_rxt_bytes > tc->bytes_acked)
+ tc->snd_rxt_bytes -= tc->bytes_acked;
+ else
+ tc->snd_rxt_bytes = 0;
}
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
@@ -1325,7 +1432,7 @@
/*
* Since this was a partial ack, try to retransmit some more data
*/
- tcp_fast_retransmit (tc);
+ tcp_program_fastretransmit (tc);
}
/**
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index ed1c641..2e6036b 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -1409,7 +1409,11 @@
/* Cleanly recover cc (also clears up fast retransmit) */
if (tcp_in_fastrecovery (tc))
- tcp_cc_fastrecovery_exit (tc);
+ {
+ /* TODO be less aggressive about this */
+ scoreboard_clear (&tc->sack_sb);
+ tcp_cc_fastrecovery_exit (tc);
+ }
/* Start again from the beginning */
tc->cc_algo->congestion (tc);
@@ -1487,6 +1491,8 @@
/* First retransmit timeout */
if (tc->rto_boff == 1)
tcp_rxt_timeout_cc (tc);
+ else
+ scoreboard_clear (&tc->sack_sb);
/* If we've sent beyond snd_congestion, update it */
if (seq_gt (tc->snd_una_max, tc->snd_congestion))
@@ -1499,9 +1505,6 @@
* shortfall */
n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
- /* TODO be less aggressive about this */
- scoreboard_clear (&tc->sack_sb);
-
if (n_bytes == 0)
{
tcp_retransmit_timer_force_update (tc);
@@ -1680,7 +1683,7 @@
/**
* Retransmit first unacked segment
*/
-void
+int
tcp_retransmit_first_unacked (tcp_connection_t * tc)
{
vlib_main_t *vm = vlib_get_main ();
@@ -1691,20 +1694,23 @@
tc->snd_nxt = tc->snd_una;
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
+
n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b);
if (!n_bytes)
- return;
+ return -1;
+
bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
-
tc->snd_nxt = old_snd_nxt;
+
+ return 0;
}
/**
* Do fast retransmit with SACKs
*/
-void
-tcp_fast_retransmit_sack (tcp_connection_t * tc)
+int
+tcp_fast_retransmit_sack (tcp_connection_t * tc, u32 burst_size)
{
vlib_main_t *vm = vlib_get_main ();
u32 n_written = 0, offset, max_bytes, n_segs = 0;
@@ -1720,13 +1726,16 @@
old_snd_nxt = tc->snd_nxt;
sb = &tc->sack_sb;
snd_space = tcp_available_cc_snd_space (tc);
+ hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
if (snd_space < tc->snd_mss)
- goto done;
+ {
+ tcp_program_fastretransmit (tc);
+ goto done;
+ }
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
- hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
- while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE)
+ while (snd_space > 0 && n_segs < burst_size)
{
hole = scoreboard_next_rxt_hole (sb, hole,
tcp_fastrecovery_sent_1_smss (tc),
@@ -1736,7 +1745,21 @@
if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una)
|| seq_gt (sb->rescue_rxt,
tc->snd_congestion)))
- break;
+ {
+ if (tcp_fastrecovery_first (tc))
+ break;
+
+ /* We tend to lose the first segment. Try re-resending
+ * it but only once and after we've tried everything */
+ hole = scoreboard_first_hole (sb);
+ if (hole && hole->start == tc->snd_una)
+ {
+ tcp_retransmit_first_unacked (tc);
+ tcp_fastrecovery_first_on (tc);
+ n_segs += 1;
+ }
+ break;
+ }
/* If rescue rxt undefined or less than snd_una then one segment of
* up to SMSS octets that MUST include the highest outstanding
@@ -1756,6 +1779,7 @@
bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+ n_segs += 1;
break;
}
@@ -1776,22 +1800,27 @@
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
ASSERT (n_written <= snd_space);
snd_space -= n_written;
+ n_segs += 1;
}
+ if (hole)
+ tcp_program_fastretransmit (tc);
+
done:
/* If window allows, send 1 SMSS of new data */
tc->snd_nxt = old_snd_nxt;
+ return n_segs;
}
/**
* Fast retransmit without SACK info
*/
-void
-tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
+int
+tcp_fast_retransmit_no_sack (tcp_connection_t * tc, u32 burst_size)
{
vlib_main_t *vm = vlib_get_main ();
u32 n_written = 0, offset = 0, bi, old_snd_nxt;
- int snd_space;
+ int snd_space, n_segs = 0;
vlib_buffer_t *b;
ASSERT (tcp_in_fastrecovery (tc));
@@ -1802,7 +1831,7 @@
tc->snd_nxt = tc->snd_una;
snd_space = tcp_available_cc_snd_space (tc);
- while (snd_space > 0)
+ while (snd_space > 0 && n_segs < burst_size)
{
offset += n_written;
n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b);
@@ -1814,22 +1843,29 @@
bi = vlib_get_buffer_index (vm, b);
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
snd_space -= n_written;
+ n_segs += 1;
}
+ /* More data to resend */
+ if (seq_lt (tc->snd_nxt, tc->snd_congestion))
+ tcp_program_fastretransmit (tc);
+
/* Restore snd_nxt. If window allows, send 1 SMSS of new data */
tc->snd_nxt = old_snd_nxt;
+
+ return n_segs;
}
/**
* Do fast retransmit
*/
-void
-tcp_fast_retransmit (tcp_connection_t * tc)
+int
+tcp_fast_retransmit (tcp_connection_t * tc, u32 burst_size)
{
if (tcp_opts_sack_permitted (&tc->rcv_opts))
- tcp_fast_retransmit_sack (tc);
+ return tcp_fast_retransmit_sack (tc, burst_size);
else
- tcp_fast_retransmit_no_sack (tc);
+ return tcp_fast_retransmit_no_sack (tc, burst_size);
}
static u32