Fix TCP loss recovery, VPP-745
Allows pure loss recovery retransmits only on timeout.
Change-Id: I563cdbf9e7b890a6569350bdbda4f746ace0544e
Signed-off-by: Florin Coras <fcoras@cisco.com>
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index de4edfa..e80e2ec 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -567,30 +567,49 @@
return tc->snd_mss;
}
+always_inline u32
+tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
+{
+ if (tc->snd_wnd < tc->snd_mss)
+ {
+ return tc->snd_wnd < snd_space ? tc->snd_wnd : 0;
+ }
+
+ /* If we can't write at least a segment, don't try at all */
+ if (snd_space < tc->snd_mss)
+ return 0;
+
+ /* round down to mss multiple */
+ return snd_space - (snd_space % tc->snd_mss);
+}
+
/**
* Compute tx window session is allowed to fill.
*/
u32
tcp_session_send_space (transport_connection_t * trans_conn)
{
- u32 snd_space, chunk;
+ int snd_space;
tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
/* If we haven't gotten dupacks or if we did and have gotten sacked bytes
* then we can still send */
- if (PREDICT_TRUE (tcp_in_fastrecovery (tc) == 0
+ if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0
&& (tc->rcv_dupacks == 0
|| tc->sack_sb.last_sacked_bytes)))
{
- chunk = tc->snd_wnd > tc->snd_mss ? tc->snd_mss : tc->snd_wnd;
snd_space = tcp_available_snd_space (tc);
+ return tcp_round_snd_space (tc, snd_space);
+ }
- /* If we can't write at least a segment, don't try at all */
- if (chunk == 0 || snd_space < chunk)
+ if (tcp_in_recovery (tc))
+ {
+ tc->snd_nxt = tc->snd_una_max;
+ snd_space = tcp_available_wnd (tc) - tc->rtx_bytes
+ - (tc->snd_una_max - tc->snd_congestion);
+ if (snd_space <= 0)
return 0;
-
- /* round down to mss multiple */
- return snd_space - (snd_space % chunk);
+ return tcp_round_snd_space (tc, snd_space);
}
/* If in fast recovery, send 1 SMSS if wnd allows */
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index f61a1b5..c75479d 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -24,8 +24,8 @@
#include <vnet/session/session.h>
#include <vnet/tcp/tcp_debug.h>
-#define TCP_TICK 10e-3 /**< TCP tick period (s) */
-#define THZ 1/TCP_TICK /**< TCP tick frequency */
+#define TCP_TICK 0.001 /**< TCP tick period (s) */
+#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */
#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */
#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */
#define TCP_MAX_OPTION_SPACE 40
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 0030cfe..e9c52c5 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -392,11 +392,10 @@
/* Karn's rule, part 1. Don't use retransmitted segments to estimate
* RTT because they're ambiguous. */
- if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff)
+ if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !tc->rto_boff)
{
mrtt = tcp_time_now () - tc->rtt_ts;
}
-
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
* snd_una, i.e., the left side of the send window:
* seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't
@@ -406,19 +405,22 @@
mrtt = tcp_time_now () - tc->opt.tsecr;
}
+ /* Allow measuring of a new RTT */
+ tc->rtt_ts = 0;
+
+ /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is
+ * not valid */
+ if (tc->bytes_acked)
+ tc->rto_boff = 0;
+
/* Ignore dubious measurements */
if (mrtt == 0 || mrtt > TCP_RTT_MAX)
return 0;
tcp_estimate_rtt (tc, mrtt);
-
tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
- /* Allow measuring of RTT and make sure boff is 0 */
- tc->rtt_seq = 0;
- tc->rto_boff = 0;
-
- return 1;
+ return 0;
}
/**
@@ -735,7 +737,7 @@
{
u8 partial_ack;
- if (tcp_in_cong_recovery (tc))
+ if (tcp_in_fastrecovery (tc))
{
partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
if (!partial_ack)
@@ -749,6 +751,7 @@
/* Clear retransmitted bytes. XXX should we clear all? */
tc->rtx_bytes = 0;
+
tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
/* In case snd_nxt is still in the past and output tries to
@@ -772,6 +775,13 @@
tc->cc_algo->rcv_ack (tc);
tc->tsecr_last_ack = tc->opt.tsecr;
tc->rcv_dupacks = 0;
+ if (tcp_in_recovery (tc))
+ {
+ tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes);
+ tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ if (seq_geq (tc->snd_una, tc->snd_congestion))
+ tcp_recovery_off (tc);
+ }
}
}
@@ -897,7 +907,7 @@
tcp_cc_rcv_ack (tc, b);
/* If everything has been acked, stop retransmit timer
- * otherwise update */
+ * otherwise update. */
if (tc->snd_una == tc->snd_una_max)
tcp_retransmit_timer_reset (tc);
else
@@ -1778,6 +1788,11 @@
tcp_send_reset (b0, is_ip4);
goto drop;
}
+
+ /* Update rtt and rto */
+ tc0->bytes_acked = 1;
+ tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
+
/* Switch state to ESTABLISHED */
tc0->state = TCP_STATE_ESTABLISHED;
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index 856dffe..3525f4e 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -38,7 +38,7 @@
else
{
/* Round up to 1 if needed */
- tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1);
+ tc->cwnd += clib_max ((tc->snd_mss * tc->snd_mss) / tc->cwnd, 1);
}
}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index a85d30d..7ee930c 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -73,7 +73,7 @@
snd_mss = dummy_mtu;
/* TODO cache mss and consider PMTU discovery */
- snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss;
+ snd_mss = clib_min (tc->opt.mss, snd_mss);
tc->snd_mss = snd_mss;
@@ -923,6 +923,12 @@
/* TODO this is updated in output as well ... */
if (tc->snd_nxt > tc->snd_una_max)
tc->snd_una_max = tc->snd_nxt;
+
+ if (tc->rtt_ts == 0)
+ {
+ tc->rtt_ts = tcp_time_now ();
+ tc->rtt_seq = tc->snd_nxt;
+ }
TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
}
@@ -1019,9 +1025,10 @@
}
/* Start again from the beginning */
- tcp_recovery_on (tc);
+
tc->cwnd = tcp_loss_wnd (tc);
tc->snd_congestion = tc->snd_una_max;
+ tcp_recovery_on (tc);
}
static void
@@ -1032,7 +1039,7 @@
u32 thread_index = vlib_get_thread_index ();
tcp_connection_t *tc;
vlib_buffer_t *b;
- u32 bi, snd_space, n_bytes;
+ u32 bi, n_bytes;
if (is_syn)
{
@@ -1065,34 +1072,17 @@
/* Exponential backoff */
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
- /* Figure out what and how many bytes we can send */
- snd_space = tcp_available_snd_space (tc);
-
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
- if (snd_space == 0)
+ /* Send one segment. No fancy recovery for now! */
+ n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
+ scoreboard_clear (&tc->sack_sb);
+
+ if (n_bytes == 0)
{
- clib_warning ("no wnd to retransmit");
- tcp_return_buffer (tm);
-
- /* Force one segment */
- tcp_retransmit_first_unacked (tc);
-
- /* Re-enable retransmit timer. Output may be unwilling
- * to do it for us */
- tcp_retransmit_timer_set (tc);
-
+ clib_warning ("could not retransmit");
return;
}
- else
- {
- /* No fancy recovery for now! */
- n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space);
- scoreboard_clear (&tc->sack_sb);
-
- if (n_bytes == 0)
- return;
- }
}
else
{
@@ -1400,7 +1390,7 @@
}
/* If not retransmitting
- * 1) update snd_una_max (SYN, SYNACK, new data, FIN)
+ * 1) update snd_una_max (SYN, SYNACK, FIN)
* 2) If we're not tracking an ACK, start tracking */
if (seq_lt (tc0->snd_una_max, tc0->snd_nxt))
{