TCP cc/window management fixes and debugging
- added persist timer
- update rcv_las whenever sending an ack
- moved fifo size to its own cache line
- improved session and builtin client debugging
Change-Id: Ia649cf942cf0c061a713e8b67f0eb6974a6cd55b
Signed-off-by: Florin Coras <fcoras@cisco.com>
Signed-off-by: Dave Barach <dave@barachs.net>
diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c
index 07b0d2d..cc84feb 100644
--- a/src/svm/svm_fifo.c
+++ b/src/svm/svm_fifo.c
@@ -254,6 +254,10 @@
{
ooo_segment_t *s;
u32 index, bytes = 0, diff;
+ u32 cursize;
+
+ /* read cursize, which can only increase while we're working */
+ cursize = svm_fifo_max_dequeue (f);
s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head);
@@ -286,8 +290,8 @@
/* If tail is adjacent to an ooo segment, 'consume' it */
if (diff == 0)
{
- bytes = ((f->nitems - f->cursize) >= s->length) ? s->length :
- f->nitems - f->cursize;
+ bytes = ((f->nitems - cursize) >= s->length) ? s->length :
+ f->nitems - cursize;
f->tail += bytes;
f->tail %= f->nitems;
@@ -305,11 +309,12 @@
u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
u32 cursize, nitems;
- if (PREDICT_FALSE (f->cursize == f->nitems))
+ /* read cursize, which can only increase while we're working */
+ cursize = svm_fifo_max_dequeue (f);
+
+ if (PREDICT_FALSE (cursize == f->nitems))
return -2; /* fifo stuffed */
- /* read cursize, which can only decrease while we're working */
- cursize = f->cursize;
nitems = f->nitems;
/* Number of bytes we're going to copy */
@@ -382,8 +387,8 @@
ASSERT (offset > 0);
- /* read cursize, which can only decrease while we're working */
- cursize = f->cursize;
+ /* read cursize, which can only increase while we're working */
+ cursize = svm_fifo_max_dequeue (f);
nitems = f->nitems;
/* Will this request fit? */
@@ -437,11 +442,11 @@
u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
u32 cursize, nitems;
- if (PREDICT_FALSE (f->cursize == 0))
+ /* read cursize, which can only increase while we're working */
+ cursize = svm_fifo_max_dequeue (f);
+ if (PREDICT_FALSE (cursize == 0))
return -2; /* nothing in the fifo */
- /* read cursize, which can only increase while we're working */
- cursize = f->cursize;
nitems = f->nitems;
/* Number of bytes we're going to copy */
@@ -495,11 +500,11 @@
u32 total_copy_bytes, first_copy_bytes, second_copy_bytes;
u32 cursize, nitems, real_head;
- if (PREDICT_FALSE (f->cursize == 0))
+ /* read cursize, which can only increase while we're working */
+ cursize = svm_fifo_max_dequeue (f);
+ if (PREDICT_FALSE (cursize == 0))
return -2; /* nothing in the fifo */
- /* read cursize, which can only increase while we're working */
- cursize = f->cursize;
nitems = f->nitems;
real_head = f->head + offset;
real_head = real_head >= nitems ? real_head - nitems : real_head;
@@ -532,11 +537,11 @@
u32 total_drop_bytes, first_drop_bytes, second_drop_bytes;
u32 cursize, nitems;
- if (PREDICT_FALSE (f->cursize == 0))
+ /* read cursize, which can only increase while we're working */
+ cursize = svm_fifo_max_dequeue (f);
+ if (PREDICT_FALSE (cursize == 0))
return -2; /* nothing in the fifo */
- /* read cursize, which can only increase while we're working */
- cursize = f->cursize;
nitems = f->nitems;
/* Number of bytes we're going to drop */
diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h
index 3955617..80e5b0f 100644
--- a/src/svm/svm_fifo.h
+++ b/src/svm/svm_fifo.h
@@ -44,14 +44,16 @@
typedef struct
{
+ volatile u32 cursize; /**< current fifo size */
+ u32 nitems;
+ CLIB_CACHE_LINE_ALIGN_MARK (end_cursize);
+
pthread_mutex_t mutex; /* 8 bytes */
pthread_cond_t condvar; /* 8 bytes */
svm_lock_tag_t tag;
- volatile u32 cursize; /**< current fifo size */
volatile u8 has_event; /**< non-zero if deq event exists */
u32 owner_pid;
- u32 nitems;
/* Backpointers */
u32 server_session_index;
@@ -105,7 +107,7 @@
static inline u32
svm_fifo_max_enqueue (svm_fifo_t * f)
{
- return f->nitems - f->cursize;
+ return f->nitems - svm_fifo_max_dequeue (f);
}
static inline u8
diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c
index 8681105..b86e87d 100644
--- a/src/vnet/session/node.c
+++ b/src/vnet/session/node.c
@@ -119,15 +119,20 @@
/* Nothing to read return */
if (max_dequeue0 == 0)
- {
- return 0;
- }
+ return 0;
/* Ensure we're not writing more than transport window allows */
- max_len_to_snd0 = clib_min (max_dequeue0, snd_space0);
-
- /* TODO check if transport is willing to send len_to_snd0
- * bytes (Nagle) */
+ if (max_dequeue0 < snd_space0)
+ {
+ /* Constrained by tx queue. Try to send only fully formed segments */
+ max_len_to_snd0 = (max_dequeue0 > snd_mss0) ?
+ max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0;
+ /* TODO Nagle ? */
+ }
+ else
+ {
+ max_len_to_snd0 = snd_space0;
+ }
n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE;
n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes);
@@ -308,11 +313,14 @@
int n_tx_packets = 0;
u32 my_thread_index = vm->cpu_index;
int i, rv;
+ f64 now = vlib_time_now (vm);
+
+ SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index);
/*
* Update TCP time
*/
- tcp_update_time (vlib_time_now (vm), my_thread_index);
+ tcp_update_time (now, my_thread_index);
/*
* Get vpp queue events
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index f10918a..8e2b261 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -556,7 +556,7 @@
u8 * added_a_segment)
{
svm_fifo_segment_private_t *fifo_segment;
- u32 fifo_size, default_fifo_size = 128 << 10; /* TODO config */
+ u32 fifo_size, default_fifo_size = 1 << 16; /* TODO config */
int i;
*added_a_segment = 0;
@@ -1293,6 +1293,10 @@
vec_validate (smm->current_enqueue_epoch, num_threads - 1);
vec_validate (smm->vpp_event_queues, num_threads - 1);
+#if SESSION_DBG
+ vec_validate (smm->last_event_poll_by_thread, num_threads - 1);
+#endif
+
/* $$$$ preallocate hack config parameter */
for (i = 0; i < 200000; i++)
{
diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index a39bc06..6878b4d 100644
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h
@@ -20,6 +20,7 @@
#include <vlibmemory/api.h>
#include <vppinfra/sparse_vec.h>
#include <svm/svm_fifo_segment.h>
+#include <vnet/session/session_debug.h>
#define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0)
#define INVALID_INDEX ((u32)~0)
@@ -36,7 +37,7 @@
FIFO_EVENT_BUILTIN_RX
} fifo_event_type_t;
-#define foreach_session_input_error \
+#define foreach_session_input_error \
_(NO_SESSION, "No session drops") \
_(NO_LISTENER, "No listener for dst port drops") \
_(ENQUEUED, "Packets pushed into rx fifo") \
@@ -218,6 +219,15 @@
/* Convenience */
vlib_main_t *vlib_main;
vnet_main_t *vnet_main;
+
+#if SESSION_DBG
+ /**
+ * last event poll time by thread
+ * Debug only. Will cause false cache-line sharing as-is
+ */
+ f64 *last_event_poll_by_thread;
+#endif
+
};
extern session_manager_main_t session_manager_main;
diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h
index 80a97cd..eb11f1a 100644
--- a/src/vnet/session/session_debug.h
+++ b/src/vnet/session/session_debug.h
@@ -16,13 +16,13 @@
#define SRC_VNET_SESSION_SESSION_DEBUG_H_
#include <vnet/session/transport.h>
-#include <vnet/session/session.h>
#include <vlib/vlib.h>
#define foreach_session_dbg_evt \
_(ENQ, "enqueue") \
_(DEQ, "dequeue") \
- _(DEQ_NODE, "dequeue")
+ _(DEQ_NODE, "dequeue") \
+ _(POLL_GAP_TRACK, "poll gap track") \
typedef enum _session_evt_dbg
{
@@ -33,6 +33,7 @@
#define SESSION_DBG (0)
#define SESSION_DEQ_NODE_EVTS (0)
+#define SESSION_EVT_POLL_DBG (1)
#if TRANSPORT_DEBUG && SESSION_DBG
@@ -97,9 +98,34 @@
#define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt)
#endif
+#if SESSION_DBG && SESSION_EVT_POLL_DBG
+#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "nixon-gap: %d MS", \
+ .format_args = "i4", \
+ }; \
+ DEC_SESSION_ED(_e, 1); \
+ ed->data[0] = (u32) ((now - \
+ _smm->last_event_poll_by_thread[my_thread_index])*1000.0); \
+}
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index) \
+{ \
+ if (PREDICT_TRUE( \
+ smm->last_event_poll_by_thread[my_thread_index] != 0.0)) \
+ if (now > smm->last_event_poll_by_thread[_my_thread_index] + 500e-6)\
+ SESSION_EVT_POLL_GAP(smm, my_thread_index); \
+ _smm->last_event_poll_by_thread[my_thread_index] = now; \
+}
+
+#else
+#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index)
+#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index)
+#endif
+
#define CONCAT_HELPER(_a, _b) _a##_b
#define CC(_a, _b) CONCAT_HELPER(_a, _b)
-
#define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args)
#else
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 83cdbc1..e370506 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -43,6 +43,10 @@
#include <vpp/api/vpe_all_api_h.h>
#undef vl_printfun
+#define TCP_BUILTIN_CLIENT_DBG (1)
+#define TCP_BUILTIN_CLIENT_VPP_THREAD (0)
+#define TCP_BUILTIN_CLIENT_PTHREAD (!TCP_BUILTIN_CLIENT_VPP_THREAD)
+
static void
send_test_chunk (tclient_main_t * tm, session_t * s)
{
@@ -52,35 +56,50 @@
session_fifo_event_t evt;
static int serial_number = 0;
int rv;
+ test_buf_offset = s->bytes_sent % vec_len (test_data);
+ bytes_this_chunk = vec_len (test_data) - test_buf_offset;
- while (s->bytes_to_send > 0)
+ bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
+ ? bytes_this_chunk : s->bytes_to_send;
+
+ rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ ,
+ bytes_this_chunk,
+ test_data + test_buf_offset);
+
+ /* If we managed to enqueue data... */
+ if (rv > 0)
{
-
- test_buf_offset = s->bytes_sent % vec_len (test_data);
- bytes_this_chunk = vec_len (test_data) - test_buf_offset;
-
- bytes_this_chunk = bytes_this_chunk < s->bytes_to_send
- ? bytes_this_chunk : s->bytes_to_send;
-
- rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ ,
- bytes_this_chunk,
- test_data + test_buf_offset);
-
- if (rv > 0)
+ if (TCP_BUILTIN_CLIENT_DBG)
{
- s->bytes_to_send -= rv;
- s->bytes_sent += rv;
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "tx-enq: %d bytes",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 data[1];
+ } *ed;
+ ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+ ed->data[0] = rv;
+ }
- if (svm_fifo_set_event (s->server_tx_fifo))
- {
- /* Fabricate TX event, send to vpp */
- evt.fifo = s->server_tx_fifo;
- evt.event_type = FIFO_EVENT_SERVER_TX;
- evt.event_id = serial_number++;
+ /* Account for it... */
+ s->bytes_to_send -= rv;
+ s->bytes_sent += rv;
- unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
- 0 /* do wait for mutex */ );
- }
+ /* Poke the TCP state machine */
+ if (svm_fifo_set_event (s->server_tx_fifo))
+ {
+ /* Fabricate TX event, send to vpp */
+ evt.fifo = s->server_tx_fifo;
+ evt.event_type = FIFO_EVENT_SERVER_TX;
+ evt.event_id = serial_number++;
+
+ unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt,
+ 0 /* do wait for mutex */ );
}
}
}
@@ -89,39 +108,55 @@
receive_test_chunk (tclient_main_t * tm, session_t * s)
{
svm_fifo_t *rx_fifo = s->server_rx_fifo;
- int n_read, bytes, i;
+ int n_read, test_bytes = 0;
- bytes = svm_fifo_max_dequeue (rx_fifo);
/* Allow enqueuing of new event */
- svm_fifo_unset_event (rx_fifo);
+ // svm_fifo_unset_event (rx_fifo);
- /* Read the bytes */
- do
+ n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf),
+ tm->rx_buf);
+ if (n_read > 0)
{
- n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf),
- tm->rx_buf);
- if (n_read > 0)
+ if (TCP_BUILTIN_CLIENT_DBG)
{
- bytes -= n_read;
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "rx-deq: %d bytes",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 data[1];
+ } *ed;
+ ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+ ed->data[0] = n_read;
+ }
+
+ if (test_bytes)
+ {
+ int i;
for (i = 0; i < n_read; i++)
{
if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff))
{
clib_warning ("read %d error at byte %lld, 0x%x not 0x%x",
- n_read, s->bytes_received + i,
- tm->rx_buf[i],
+ n_read, s->bytes_received + i, tm->rx_buf[i],
((s->bytes_received + i) & 0xff));
}
}
- s->bytes_to_receive -= n_read;
- s->bytes_received += n_read;
}
-
+ s->bytes_to_receive -= n_read;
+ s->bytes_received += n_read;
}
- while (n_read < 0 || bytes > 0);
}
+#if TCP_BUILTIN_CLIENT_VPP_THREAD
+static void
+#else
static void *
+#endif
tclient_thread_fn (void *arg)
{
tclient_main_t *tm = &tclient_main;
@@ -139,6 +174,8 @@
pthread_sigmask (SIG_SETMASK, &s, 0);
}
+ clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0];
+
while (1)
{
/* Wait until we're told to get busy */
@@ -186,12 +223,12 @@
/* Disconnect sessions... */
vec_reset_length (session_indices);
- pool_foreach (sp, tm->sessions, (
- {
- vec_add1 (session_indices,
- sp - tm->sessions);
- }
- ));
+
+ /* *INDENT-OFF* */
+ pool_foreach (sp, tm->sessions, ({
+ vec_add1 (session_indices, sp - tm->sessions);
+ }));
+ /* *INDENT-ON* */
for (i = 0; i < vec_len (session_indices); i++)
{
@@ -207,7 +244,9 @@
}
}
/* NOTREACHED */
+#if TCP_BUILTIN_CLIENT_PTHREAD
return 0;
+#endif
}
/* So we don't get "no handler for... " msgs */
@@ -333,7 +372,7 @@
unformat_input_t * input,
vlib_cli_command_t * cmd)
{
- u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234";
+ u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234";
u8 *uri;
tclient_main_t *tm = &tclient_main;
int i;
@@ -349,7 +388,7 @@
;
else if (unformat (input, "iterations %d", &tm->n_iterations))
;
- else if (unformat (input, "bytes %d", &tm->bytes_to_send))
+ else if (unformat (input, "bytes %lld", &tm->bytes_to_send))
;
else if (unformat (input, "uri %s", &tm->connect_uri))
;
@@ -366,17 +405,20 @@
create_api_loopback (tm);
+#if TCP_BUILTIN_CLIENT_PTHREAD
/* Start a transmit thread */
if (tm->client_thread_handle == 0)
{
int rv = pthread_create (&tm->client_thread_handle,
- NULL /*attr */ , tclient_thread_fn, 0);
+ NULL /*attr */ ,
+ tclient_thread_fn, 0);
if (rv)
{
tm->client_thread_handle = 0;
return clib_error_return (0, "pthread_create returned %d", rv);
}
}
+#endif
/* Fire off connect requests, in something approaching a normal manner */
for (i = 0; i < n_clients; i++)
@@ -397,6 +439,18 @@
return 0;
}
+#if TCP_BUILTIN_CLIENT_VPP_THREAD
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (builtin_client_reg, static) = {
+ .name = "tcp-builtin-client",
+ .function = tclient_thread_fn,
+ .fixed_count = 1,
+ .count = 1,
+ .no_data_structure_clone = 1,
+};
+/* *INDENT-ON* */
+#endif
+
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (test_clients_command, static) =
{
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index efd26e9..917d4bd 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -127,6 +127,7 @@
{
/* XXX timeout for session that are stuck */
+ rx_event:
/* Program self-tap to retry */
if (svm_fifo_set_event (rx_fifo))
{
@@ -158,7 +159,9 @@
n_written =
svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf);
- ASSERT (n_written == max_transfer);
+
+ if (n_written != max_transfer)
+ clib_warning ("short trout!");
if (svm_fifo_set_event (tx_fifo))
{
@@ -171,6 +174,9 @@
(u8 *) & evt, 0 /* do wait for mutex */ );
}
+ if (PREDICT_FALSE (max_enqueue < max_dequeue))
+ goto rx_event;
+
return 0;
}
@@ -204,8 +210,8 @@
a->session_cb_vft = &builtin_session_cb_vft;
a->options = options;
a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20;
- a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10;
- a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10;
+ a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16;
+ a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16;
a->segment_name = segment_name;
a->segment_name_length = ARRAY_LEN (segment_name);
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index c3df5bc..b2a371e 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -578,7 +578,9 @@
/* If we can't write at least a segment, don't try at all */
if (snd_space < tc->snd_mss)
return 0;
- return snd_space;
+
+ /* round down to mss multiple */
+ return snd_space - (snd_space % tc->snd_mss);
}
/* If in fast recovery, send 1 SMSS if wnd allows */
@@ -706,7 +708,7 @@
{
tcp_timer_retransmit_handler,
tcp_timer_delack_handler,
- 0,
+ tcp_timer_persist_handler,
tcp_timer_keep_handler,
tcp_timer_waitclose_handler,
tcp_timer_retransmit_syn_handler,
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index b4286bc..2f5da10 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -81,6 +81,7 @@
extern timer_expiration_handler tcp_timer_delack_handler;
extern timer_expiration_handler tcp_timer_retransmit_handler;
+extern timer_expiration_handler tcp_timer_persist_handler;
extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
#define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
@@ -253,13 +254,25 @@
#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY
#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY
+#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY
+#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY
#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
-#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh)
#define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS)
#define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS)
#define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS)
+#define tcp_in_cong_recovery(tc) ((tc)->flags & \
+ (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+
+always_inline void
+tcp_cong_recovery_off (tcp_connection_t * tc)
+{
+ tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
+ tcp_fastrecovery_1_smss_off (tc);
+}
+
typedef enum
{
TCP_IP4,
@@ -538,6 +551,27 @@
tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
}
+always_inline void
+tcp_persist_timer_set (tcp_connection_t * tc)
+{
+ /* Reuse RTO. It's backed off in handler */
+ tcp_timer_set (tc, TCP_TIMER_PERSIST,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_update (tcp_connection_t * tc)
+{
+ tcp_timer_update (tc, TCP_TIMER_PERSIST,
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
+always_inline void
+tcp_persist_timer_reset (tcp_connection_t * tc)
+{
+ tcp_timer_reset (tc, TCP_TIMER_PERSIST);
+}
+
always_inline u8
tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
{
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 5a71694..0090e15 100644
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -31,6 +31,7 @@
_(UNBIND, "unbind") \
_(DELETE, "delete") \
_(SYN_SENT, "SYN sent") \
+ _(SYN_RTX, "SYN retransmit") \
_(FIN_SENT, "FIN sent") \
_(ACK_SENT, "ACK sent") \
_(DUPACK_SENT, "DUPACK sent") \
@@ -50,6 +51,7 @@
_(CC_PACK, "cc partial ack") \
_(SEG_INVALID, "invalid segment") \
_(ACK_RCV_ERR, "invalid ack") \
+ _(RCV_WND_SHRUNK, "shrunk rcv_wnd") \
typedef enum _tcp_dbg
{
@@ -159,35 +161,48 @@
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u", \
- .format_args = "i4i4i4i4", \
+ .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
}; \
- DECLARE_ETD(_tc, _e, 4); \
+ DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \
ed->data[1] = _tc->rcv_nxt - _tc->irs; \
ed->data[2] = _tc->rcv_wnd; \
ed->data[3] = _tc->snd_nxt - _tc->iss; \
+ ed->data[4] = _tc->snd_wnd; \
}
#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u", \
- .format_args = "i4i4i4i4", \
+ .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
}; \
- DECLARE_ETD(_tc, _e, 4); \
+ DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->rcv_nxt - _tc->irs; \
ed->data[1] = _tc->rcv_wnd; \
ed->data[2] = _tc->snd_nxt - _tc->iss; \
ed->data[3] = tcp_available_wnd(_tc); \
+ ed->data[4] = _tc->snd_wnd; \
}
#define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "SYNtx: iss %u", \
+ .format = "SYNtx: iss %u", \
+ .format_args = "i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 1); \
+ ed->data[0] = _tc->iss; \
+}
+
+#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "SYNrtx: iss %u", \
.format_args = "i4", \
}; \
DECLARE_ETD(_tc, _e, 1); \
@@ -254,17 +269,17 @@
ed->data[1] = _tc->rcv_nxt - _tc->irs; \
}
-#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...) \
+#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) \
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u", \
+ .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \
.format_args = "i4i4i4i4i4", \
}; \
DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->bytes_acked; \
ed->data[1] = _tc->snd_una - _tc->iss; \
- ed->data[2] = _ack - _tc->iss; \
+ ed->data[2] = _tc->snd_wnd; \
ed->data[3] = _tc->cwnd; \
ed->data[4] = tcp_flight_size(_tc); \
}
@@ -273,14 +288,15 @@
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u", \
- .format_args = "i4i4i4i4", \
+ .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\
+ .format_args = "i4i4i4i4i4", \
}; \
- DECLARE_ETD(_tc, _e, 4); \
+ DECLARE_ETD(_tc, _e, 5); \
ed->data[0] = _tc->snd_una - _tc->iss; \
ed->data[1] = _tc->cwnd; \
ed->data[2] = _tc->snd_wnd; \
ed->data[3] = tcp_flight_size(_tc); \
+ ed->data[4] = _tc->rcv_wnd; \
}
#define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \
@@ -302,7 +318,7 @@
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "in: %s len %u written %d rcv_nxt %u free wnd %d", \
+ .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d", \
.format_args = "t4i4i4i4i4", \
.n_enum_strings = 2, \
.enum_strings = { \
@@ -338,7 +354,7 @@
.enum_strings = { \
"retransmit", \
"delack", \
- "BUG", \
+ "persist", \
"keep", \
"waitclose", \
"retransmit syn", \
@@ -354,7 +370,7 @@
{ \
ELOG_TYPE_DECLARE (_e) = \
{ \
- .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u", \
+ .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\
.format_args = "i4i4i4i4i4", \
}; \
DECLARE_ETD(_tc, _e, 5); \
@@ -445,6 +461,24 @@
#define TCP_EVT_CC_PACK_HANDLER(_tc, ...)
#endif
+#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \
+{ \
+if (_av > 0) \
+{ \
+ ELOG_TYPE_DECLARE (_e) = \
+ { \
+ .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \
+ .format_args = "i4i4i4i4i4", \
+ }; \
+ DECLARE_ETD(_tc, _e, 5); \
+ ed->data[0] = _tc->rcv_wnd; \
+ ed->data[1] = _obs; \
+ ed->data[2] = _av; \
+ ed->data[3] = _tc->rcv_nxt - _tc->irs; \
+ ed->data[4] = _tc->rcv_las - _tc->irs; \
+} \
+}
+
#if TCP_DBG_VERBOSE
#define TCP_EVT_SND_WND_HANDLER(_tc, ...) \
{ \
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 5d11985f..a8224dc 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -276,8 +276,7 @@
if (tc0->rcv_wnd == 0
&& tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
{
- /* Make it look as if there's nothing to dequeue */
- vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number;
+ /* TODO Should segment be tagged? */
}
else
{
@@ -375,7 +374,6 @@
if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff)
{
mrtt = tcp_time_now () - tc->rtt_ts;
- tc->rtt_seq = 0;
}
/* As per RFC7323 TSecr can be used for RTTM only if the segment advances
@@ -395,6 +393,10 @@
tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ /* Allow measuring of RTT and make sure boff is 0 */
+ tc->rtt_seq = 0;
+ tc->rto_boff = 0;
+
return 1;
}
@@ -408,11 +410,7 @@
stream_session_dequeue_drop (&tc->connection, tc->bytes_acked);
/* Update rtt and rto */
- if (tcp_update_rtt (tc, ack))
- {
- /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */
- tc->rto_boff = 0;
- }
+ tcp_update_rtt (tc, ack);
}
/**
@@ -672,6 +670,13 @@
tc->snd_wl1 = seq;
tc->snd_wl2 = ack;
TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
+
+ /* Set probe timer if we just got 0 wnd */
+ if (tc->snd_wnd < tc->snd_mss
+ && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST))
+ tcp_persist_timer_set (tc);
+ else
+ tcp_persist_timer_reset (tc);
}
}
@@ -686,6 +691,10 @@
void
tcp_cc_recover (tcp_connection_t * tc)
{
+ /* TODO: check if time to recover was small. It might be that RTO popped
+ * too soon.
+ */
+
tc->cc_algo->recovered (tc);
tc->rtx_bytes = 0;
@@ -695,8 +704,7 @@
tc->cc_algo->rcv_ack (tc);
tc->tsecr_last_ack = tc->opt.tsecr;
- tcp_fastrecovery_1_smss_off (tc);
- tcp_fastrecovery_off (tc);
+ tcp_cong_recovery_off (tc);
TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
}
@@ -706,7 +714,7 @@
{
u8 partial_ack;
- if (tcp_in_recovery (tc))
+ if (tcp_in_cong_recovery (tc))
{
partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
if (!partial_ack)
@@ -724,10 +732,10 @@
/* In case snd_nxt is still in the past and output tries to
* shove some new bytes */
- tc->snd_nxt = tc->snd_una;
+ tc->snd_nxt = tc->snd_una_max;
/* XXX need proper RFC6675 support */
- if (tc->sack_sb.last_sacked_bytes)
+ if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc))
{
tcp_fast_retransmit (tc);
}
@@ -735,9 +743,6 @@
{
/* Retransmit first unacked segment */
tcp_retransmit_first_unacked (tc);
- /* If window allows, send 1 SMSS of new data */
- if (seq_lt (tc->snd_nxt, tc->snd_congestion))
- tc->snd_nxt = tc->snd_congestion;
}
}
}
@@ -814,10 +819,11 @@
return -1;
}
- tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
- *error = TCP_ERROR_ACK_FUTURE;
TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
vnet_buffer (b)->tcp.ack_number);
+
+ tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
+ *error = TCP_ERROR_ACK_FUTURE;
}
/* If old ACK, probably it's an old dupack */
@@ -863,7 +869,7 @@
* timer. */
if (tc->bytes_acked)
{
- TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number);
+ TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
/* Updates congestion control (slow start/congestion avoidance) */
tcp_cc_rcv_ack (tc, b);
@@ -966,11 +972,14 @@
tc->rcv_nxt += written;
/* Depending on how fast the app is, all remaining buffers in burst will
- * not be enqueued. Should we inform peer of the damage? XXX */
+ * not be enqueued. Inform peer */
+ tc->flags |= TCP_CONN_SNDACK;
+
return TCP_ERROR_PARTIALLY_ENQUEUED;
}
else
{
+ tc->flags |= TCP_CONN_SNDACK;
return TCP_ERROR_FIFO_FULL;
}
@@ -1101,25 +1110,17 @@
goto done;
}
- if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL))
- *next0 = TCP_NEXT_DROP;
-
/* Check if ACK can be delayed */
- if (!tcp_can_delack (tc))
- {
- /* Nothing to do for pure ACKs XXX */
- if (n_data_bytes == 0)
- goto done;
-
- *next0 = tcp_next_output (tc->c_is_ip4);
- tcp_make_ack (tc, b);
- }
- else
+ if (tcp_can_delack (tc))
{
if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
+ goto done;
}
+ *next0 = tcp_next_output (tc->c_is_ip4);
+ tcp_make_ack (tc, b);
+
done:
return error;
}
@@ -2084,6 +2085,7 @@
child0->irs = vnet_buffer (b0)->tcp.seq_number;
child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
+ child0->rcv_las = child0->rcv_nxt;
child0->state = TCP_STATE_SYN_RCVD;
/* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index a671f72..ea157bd 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -155,8 +155,7 @@
max_fifo = stream_session_fifo_size (&tc->connection);
ASSERT (tc->opt.mss < max_fifo);
-
- if (available_space < tc->opt.mss && available_space < max_fifo / 8)
+ if (available_space < tc->opt.mss && available_space < max_fifo >> 3)
available_space = 0;
/*
@@ -170,16 +169,21 @@
/* Bad. Thou shalt not shrink */
if (available_space < observed_wnd)
{
- /* Does happen! */
wnd = observed_wnd;
+ TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space);
}
else
{
wnd = available_space;
}
- if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd))
- wnd += 1 << tc->rcv_wscale;
+ /* Make sure we have a multiple of rcv_wscale */
+ if (wnd && tc->rcv_wscale)
+ {
+ wnd &= ~(1 << tc->rcv_wscale);
+ if (wnd == 0)
+ wnd = 1 << tc->rcv_wscale;
+ }
tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
}
@@ -462,8 +466,9 @@
tcp_reuse_buffer (vm, b);
tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK);
- vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc);
+ vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK;
+ tc->rcv_las = tc->rcv_nxt;
}
/**
@@ -908,6 +913,7 @@
vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
tc->snd_nxt += data_len;
+
/* TODO this is updated in output as well ... */
if (tc->snd_nxt > tc->snd_una_max)
tc->snd_una_max = tc->snd_nxt;
@@ -929,7 +935,6 @@
/* Fill in the ACK */
tcp_make_ack (tc, b);
-
tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
}
@@ -942,7 +947,6 @@
tc = tcp_connection_get (index, thread_index);
tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID;
-// tc->flags &= ~TCP_CONN_DELACK;
tcp_send_ack (tc);
}
@@ -995,7 +999,7 @@
* Reset congestion control, switch cwnd to loss window and try again.
*/
static void
-tcp_rtx_timeout_cc_recover (tcp_connection_t * tc)
+tcp_rtx_timeout_cc (tcp_connection_t * tc)
{
/* Cleanly recover cc (also clears up fast retransmit) */
if (tcp_in_fastrecovery (tc))
@@ -1008,6 +1012,7 @@
}
/* Start again from the beginning */
+ tcp_recovery_on (tc);
tc->cwnd = tcp_loss_wnd (tc);
tc->snd_congestion = tc->snd_una_max;
}
@@ -1048,7 +1053,7 @@
{
/* First retransmit timeout */
if (tc->rto_boff == 1)
- tcp_rtx_timeout_cc_recover (tc);
+ tcp_rtx_timeout_cc (tc);
/* Exponential backoff */
tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
@@ -1114,6 +1119,8 @@
{
ASSERT (tc->state == TCP_STATE_SYN_SENT);
+ TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc);
+
/* This goes straight to ipx_lookup */
tcp_push_ip_hdr (tm, tc, b);
tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4);
@@ -1137,6 +1144,55 @@
}
/**
+ * Got 0 snd_wnd from peer, try to do something about it.
+ *
+ */
+void
+tcp_timer_persist_handler (u32 index)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ vlib_main_t *vm = vlib_get_main ();
+ u32 thread_index = os_get_cpu_number ();
+ tcp_connection_t *tc;
+ vlib_buffer_t *b;
+ u32 bi, n_bytes;
+
+ tc = tcp_connection_get (index, thread_index);
+
+ /* Make sure timer handle is set to invalid */
+ tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
+
+ /* Problem already solved or worse */
+ if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc))
+ return;
+
+ /* Increment RTO backoff */
+ tc->rto_boff += 1;
+ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
+
+ /* Try to force the first unsent segment */
+ tcp_get_free_buffer_index (tm, &bi);
+ b = vlib_get_buffer (vm, bi);
+ n_bytes = stream_session_peek_bytes (&tc->connection,
+ vlib_buffer_get_current (b),
+ tc->snd_una_max - tc->snd_una,
+ tc->snd_mss);
+ /* Nothing to send */
+ if (n_bytes == 0)
+ {
+ tcp_return_buffer (tm);
+ return;
+ }
+
+ b->current_length = n_bytes;
+ tcp_push_hdr_i (tc, b, tc->state);
+ tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+
+ /* Re-enable persist timer */
+ tcp_persist_timer_set (tc);
+}
+
+/**
* Retransmit first unacked segment
*/
void
@@ -1329,9 +1385,6 @@
}
}
- /* Retransmitted SYNs do reach this but it should be harmless */
- tc0->rcv_las = tc0->rcv_nxt;
-
/* Stop DELACK timer and fix flags */
tc0->flags &= ~(TCP_CONN_SNDACK);
if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK))