tcp: move features to separate files
Type: refactor
Signed-off-by: Florin Coras <fcoras@cisco.com>
Change-Id: Ia477b8dba9266f47907967e363c11048e5cd95ab
diff --git a/src/vnet/CMakeLists.txt b/src/vnet/CMakeLists.txt
index 1574c3d..21780e2 100644
--- a/src/vnet/CMakeLists.txt
+++ b/src/vnet/CMakeLists.txt
@@ -659,9 +659,11 @@
tcp/tcp_output.c
tcp/tcp_input.c
tcp/tcp_newreno.c
- tcp/tcp_cubic.c
tcp/tcp_bt.c
+ tcp/tcp_cli.c
+ tcp/tcp_cubic.c
tcp/tcp_debug.c
+ tcp/tcp_sack.c
tcp/tcp.c
)
@@ -674,7 +676,12 @@
list(APPEND VNET_HEADERS
tcp/tcp_packet.h
tcp/tcp_timer.h
+ tcp/tcp_bt.h
+ tcp/tcp_cc.h
tcp/tcp_debug.h
+ tcp/tcp_inlines.h
+ tcp/tcp_sack.h
+ tcp/tcp_types.h
tcp/tcp.h
tcp/tcp_error.def
)
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index a1d774d..2ac938a 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -19,11 +19,10 @@
*/
#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
#include <vnet/session/session.h>
#include <vnet/fib/fib.h>
#include <vnet/dpo/load_balance.h>
-#include <vnet/dpo/receive_dpo.h>
-#include <vnet/ip-neighbor/ip_neighbor.h>
#include <math.h>
tcp_main_t tcp_main;
@@ -707,51 +706,6 @@
tc->start_ts = tcp_time_now_us (tc->c_thread_index);
}
-void
-tcp_init_w_buffer (tcp_connection_t * tc, vlib_buffer_t * b, u8 is_ip4)
-{
- tcp_header_t *th = tcp_buffer_hdr (b);
-
- tc->c_lcl_port = th->dst_port;
- tc->c_rmt_port = th->src_port;
- tc->c_is_ip4 = is_ip4;
-
- if (is_ip4)
- {
- ip4_header_t *ip4 = vlib_buffer_get_current (b);
- tc->c_lcl_ip4.as_u32 = ip4->dst_address.as_u32;
- tc->c_rmt_ip4.as_u32 = ip4->src_address.as_u32;
- }
- else
- {
- ip6_header_t *ip6 = vlib_buffer_get_current (b);
- clib_memcpy_fast (&tc->c_lcl_ip6, &ip6->dst_address,
- sizeof (ip6_address_t));
- clib_memcpy_fast (&tc->c_rmt_ip6, &ip6->src_address,
- sizeof (ip6_address_t));
- }
-
- tc->irs = vnet_buffer (b)->tcp.seq_number;
- tc->rcv_nxt = vnet_buffer (b)->tcp.seq_number + 1;
- tc->rcv_las = tc->rcv_nxt;
- tc->sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
- tc->snd_wl1 = vnet_buffer (b)->tcp.seq_number;
- tc->snd_wl2 = vnet_buffer (b)->tcp.ack_number;
-
- /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
- * segments are used to initialize PAWS. */
- if (tcp_opts_tstamp (&tc->rcv_opts))
- {
- tc->tsval_recent = tc->rcv_opts.tsval;
- tc->tsval_recent_age = tcp_time_now ();
- }
-
- if (tcp_opts_wscale (&tc->rcv_opts))
- tc->snd_wscale = tc->rcv_opts.wscale;
-
- tc->snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale;
-}
-
static int
tcp_alloc_custom_local_endpoint (tcp_main_t * tm, ip46_address_t * lcl_addr,
u16 * lcl_port, u8 is_ip4)
@@ -831,256 +785,6 @@
return tc->c_c_index;
}
-const char *tcp_fsm_states[] = {
-#define _(sym, str) str,
- foreach_tcp_fsm_state
-#undef _
-};
-
-u8 *
-format_tcp_state (u8 * s, va_list * args)
-{
- u32 state = va_arg (*args, u32);
-
- if (state < TCP_N_STATES)
- s = format (s, "%s", tcp_fsm_states[state]);
- else
- s = format (s, "UNKNOWN (%d (0x%x))", state, state);
- return s;
-}
-
-const char *tcp_cfg_flags_str[] = {
-#define _(sym, str) str,
- foreach_tcp_cfg_flag
-#undef _
-};
-
-static u8 *
-format_tcp_cfg_flags (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- int i, last = -1;
-
- for (i = 0; i < TCP_CFG_N_FLAG_BITS; i++)
- if (tc->cfg_flags & (1 << i))
- last = i;
- for (i = 0; i < last; i++)
- {
- if (tc->cfg_flags & (1 << i))
- s = format (s, "%s, ", tcp_cfg_flags_str[i]);
- }
- if (last >= 0)
- s = format (s, "%s", tcp_cfg_flags_str[last]);
- return s;
-}
-
-const char *tcp_connection_flags_str[] = {
-#define _(sym, str) str,
- foreach_tcp_connection_flag
-#undef _
-};
-
-static u8 *
-format_tcp_connection_flags (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- int i, last = -1;
-
- for (i = 0; i < TCP_CONN_N_FLAG_BITS; i++)
- if (tc->flags & (1 << i))
- last = i;
- for (i = 0; i < last; i++)
- {
- if (tc->flags & (1 << i))
- s = format (s, "%s, ", tcp_connection_flags_str[i]);
- }
- if (last >= 0)
- s = format (s, "%s", tcp_connection_flags_str[last]);
- return s;
-}
-
-const char *tcp_conn_timers[] = {
-#define _(sym, str) str,
- foreach_tcp_timer
-#undef _
-};
-
-static u8 *
-format_tcp_timers (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- int i, last = -1;
-
- for (i = 0; i < TCP_N_TIMERS; i++)
- if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
- last = i;
-
- for (i = 0; i < last; i++)
- {
- if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
- s = format (s, "%s,", tcp_conn_timers[i]);
- }
-
- if (last >= 0)
- s = format (s, "%s", tcp_conn_timers[i]);
-
- return s;
-}
-
-static u8 *
-format_tcp_congestion_status (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- if (tcp_in_recovery (tc))
- s = format (s, "recovery");
- else if (tcp_in_fastrecovery (tc))
- s = format (s, "fastrecovery");
- else
- s = format (s, "none");
- return s;
-}
-
-static i32
-tcp_rcv_wnd_available (tcp_connection_t * tc)
-{
- return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
-}
-
-static u8 *
-format_tcp_congestion (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- u32 indent = format_get_indent (s), prr_space = 0;
-
- s = format (s, "%U ", format_tcp_congestion_status, tc);
- s = format (s, "algo %s cwnd %u ssthresh %u bytes_acked %u\n",
- tc->cc_algo->name, tc->cwnd, tc->ssthresh, tc->bytes_acked);
- s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u\n",
- format_white_space, indent, tcp_available_cc_snd_space (tc),
- tc->prev_cwnd, tc->prev_ssthresh);
- s = format (s, "%Usnd_cong %u dupack %u limited_tx %u\n",
- format_white_space, indent, tc->snd_congestion - tc->iss,
- tc->rcv_dupacks, tc->limited_transmit - tc->iss);
- s = format (s, "%Urxt_bytes %u rxt_delivered %u rxt_head %u rxt_ts %u\n",
- format_white_space, indent, tc->snd_rxt_bytes,
- tc->rxt_delivered, tc->rxt_head - tc->iss,
- tcp_time_now_w_thread (tc->c_thread_index) - tc->snd_rxt_ts);
- if (tcp_in_fastrecovery (tc))
- prr_space = tcp_fastrecovery_prr_snd_space (tc);
- s = format (s, "%Uprr_start %u prr_delivered %u prr space %u\n",
- format_white_space, indent, tc->prr_start - tc->iss,
- tc->prr_delivered, prr_space);
- return s;
-}
-
-static u8 *
-format_tcp_stats (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- u32 indent = format_get_indent (s);
- s = format (s, "in segs %lu dsegs %lu bytes %lu dupacks %u\n",
- tc->segs_in, tc->data_segs_in, tc->bytes_in, tc->dupacks_in);
- s = format (s, "%Uout segs %lu dsegs %lu bytes %lu dupacks %u\n",
- format_white_space, indent, tc->segs_out,
- tc->data_segs_out, tc->bytes_out, tc->dupacks_out);
- s = format (s, "%Ufr %u tr %u rxt segs %lu bytes %lu duration %.3f\n",
- format_white_space, indent, tc->fr_occurences,
- tc->tr_occurences, tc->segs_retrans, tc->bytes_retrans,
- tcp_time_now_us (tc->c_thread_index) - tc->start_ts);
- s = format (s, "%Uerr wnd data below %u above %u ack below %u above %u",
- format_white_space, indent, tc->errors.below_data_wnd,
- tc->errors.above_data_wnd, tc->errors.below_ack_wnd,
- tc->errors.above_ack_wnd);
- return s;
-}
-
-static u8 *
-format_tcp_vars (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- s = format (s, " index: %u cfg: %U flags: %U timers: %U\n", tc->c_c_index,
- format_tcp_cfg_flags, tc, format_tcp_connection_flags, tc,
- format_tcp_timers, tc);
- s = format (s, " snd_una %u snd_nxt %u snd_una_max %u",
- tc->snd_una - tc->iss, tc->snd_nxt - tc->iss,
- tc->snd_una_max - tc->iss);
- s = format (s, " rcv_nxt %u rcv_las %u\n",
- tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs);
- s = format (s, " snd_wnd %u rcv_wnd %u rcv_wscale %u ",
- tc->snd_wnd, tc->rcv_wnd, tc->rcv_wscale);
- s = format (s, "snd_wl1 %u snd_wl2 %u\n", tc->snd_wl1 - tc->irs,
- tc->snd_wl2 - tc->iss);
- s = format (s, " flight size %u out space %u rcv_wnd_av %u",
- tcp_flight_size (tc), tcp_available_output_snd_space (tc),
- tcp_rcv_wnd_available (tc));
- s = format (s, " tsval_recent %u\n", tc->tsval_recent);
- s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u",
- tc->rcv_opts.tsecr, tc->tsecr_last_ack,
- tcp_time_now () - tc->tsval_recent_age);
- s = format (s, " snd_mss %u\n", tc->snd_mss);
- s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %.4f",
- tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar,
- tc->rtt_ts);
- s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss);
- s = format (s, " next_node %u opaque 0x%x fib_index %u\n",
- tc->next_node_index, tc->next_node_opaque, tc->c_fib_index);
- s = format (s, " cong: %U", format_tcp_congestion, tc);
-
- if (tc->state >= TCP_STATE_ESTABLISHED)
- {
- s = format (s, " sboard: %U\n", format_tcp_scoreboard, &tc->sack_sb,
- tc);
- s = format (s, " stats: %U\n", format_tcp_stats, tc);
- }
- if (vec_len (tc->snd_sacks))
- s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
-
- return s;
-}
-
-u8 *
-format_tcp_connection_id (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- if (!tc)
- return s;
- if (tc->c_is_ip4)
- {
- s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index,
- tc->c_s_index, "T", format_ip4_address, &tc->c_lcl_ip4,
- clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address,
- &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port));
- }
- else
- {
- s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index,
- tc->c_s_index, "T", format_ip6_address, &tc->c_lcl_ip6,
- clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address,
- &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port));
- }
-
- return s;
-}
-
-u8 *
-format_tcp_connection (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- u32 verbose = va_arg (*args, u32);
-
- if (!tc)
- return s;
- s = format (s, "%-50U", format_tcp_connection_id, tc);
- if (verbose)
- {
- s = format (s, "%-15U", format_tcp_state, tc->state);
- if (verbose > 1)
- s = format (s, "\n%U", format_tcp_vars, tc);
- }
-
- return s;
-}
-
static u8 *
format_tcp_session (u8 * s, va_list * args)
{
@@ -1119,100 +823,6 @@
return format (s, "%U", format_tcp_connection_id, tc);
}
-u8 *
-format_tcp_sacks (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- sack_block_t *sacks = tc->snd_sacks;
- sack_block_t *block;
- int i, len = 0;
-
- len = vec_len (sacks);
- for (i = 0; i < len - 1; i++)
- {
- block = &sacks[i];
- s = format (s, " start %u end %u\n", block->start - tc->irs,
- block->end - tc->irs);
- }
- if (len)
- {
- block = &sacks[len - 1];
- s = format (s, " start %u end %u", block->start - tc->irs,
- block->end - tc->irs);
- }
- return s;
-}
-
-u8 *
-format_tcp_rcv_sacks (u8 * s, va_list * args)
-{
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- sack_block_t *sacks = tc->rcv_opts.sacks;
- sack_block_t *block;
- int i, len = 0;
-
- len = vec_len (sacks);
- for (i = 0; i < len - 1; i++)
- {
- block = &sacks[i];
- s = format (s, " start %u end %u\n", block->start - tc->iss,
- block->end - tc->iss);
- }
- if (len)
- {
- block = &sacks[len - 1];
- s = format (s, " start %u end %u", block->start - tc->iss,
- block->end - tc->iss);
- }
- return s;
-}
-
-static u8 *
-format_tcp_sack_hole (u8 * s, va_list * args)
-{
- sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *);
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- if (tc)
- s = format (s, " [%u, %u]", hole->start - tc->iss, hole->end - tc->iss);
- else
- s = format (s, " [%u, %u]", hole->start, hole->end);
- return s;
-}
-
-u8 *
-format_tcp_scoreboard (u8 * s, va_list * args)
-{
- sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *);
- tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
- sack_scoreboard_hole_t *hole;
- u32 indent = format_get_indent (s);
-
- s = format (s, "sacked %u last_sacked %u lost %u last_lost %u"
- " rxt_sacked %u\n",
- sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes,
- sb->last_lost_bytes, sb->rxt_sacked);
- s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u\n",
- format_white_space, indent, sb->last_bytes_delivered,
- sb->high_sacked - tc->iss, sb->is_reneging);
- s = format (s, "%Ucur_rxt_hole %u high_rxt %u rescue_rxt %u",
- format_white_space, indent, sb->cur_rxt_hole,
- sb->high_rxt - tc->iss, sb->rescue_rxt - tc->iss);
-
- hole = scoreboard_first_hole (sb);
- if (hole)
- s = format (s, "\n%Uhead %u tail %u %u holes:\n%U", format_white_space,
- indent, sb->head, sb->tail, pool_elts (sb->holes),
- format_white_space, indent);
-
- while (hole)
- {
- s = format (s, "%U", format_tcp_sack_hole, hole, tc);
- hole = scoreboard_next_hole (sb, hole);
- }
-
- return s;
-}
-
static transport_connection_t *
tcp_session_get_transport (u32 conn_index, u32 thread_index)
{
@@ -1813,7 +1423,6 @@
transport_register_protocol (TRANSPORT_PROTO_TCP, &tcp_proto,
FIB_PROTOCOL_IP6, tcp6_output_node.index);
- tcp_api_reference ();
tcp_configuration_init ();
tm->cc_algo_by_name = hash_create_string (0, sizeof (uword));
@@ -1823,666 +1432,6 @@
VLIB_INIT_FUNCTION (tcp_init);
-uword
-unformat_tcp_cc_algo (unformat_input_t * input, va_list * va)
-{
- tcp_cc_algorithm_type_e *result = va_arg (*va, tcp_cc_algorithm_type_e *);
- tcp_main_t *tm = &tcp_main;
- char *cc_algo_name;
- u8 found = 0;
- uword *p;
-
- if (unformat (input, "%s", &cc_algo_name)
- && ((p = hash_get_mem (tm->cc_algo_by_name, cc_algo_name))))
- {
- *result = *p;
- found = 1;
- }
-
- vec_free (cc_algo_name);
- return found;
-}
-
-uword
-unformat_tcp_cc_algo_cfg (unformat_input_t * input, va_list * va)
-{
- tcp_main_t *tm = vnet_get_tcp_main ();
- tcp_cc_algorithm_t *cc_alg;
- unformat_input_t sub_input;
- int found = 0;
-
- vec_foreach (cc_alg, tm->cc_algos)
- {
- if (!unformat (input, cc_alg->name))
- continue;
-
- if (cc_alg->unformat_cfg
- && unformat (input, "%U", unformat_vlib_cli_sub_input, &sub_input))
- {
- if (cc_alg->unformat_cfg (&sub_input))
- found = 1;
- }
- }
- return found;
-}
-
-static clib_error_t *
-tcp_config_fn (vlib_main_t * vm, unformat_input_t * input)
-{
- u32 cwnd_multiplier, tmp_time;
- uword memory_size;
-
- while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- {
- if (unformat (input, "preallocated-connections %d",
- &tcp_cfg.preallocated_connections))
- ;
- else if (unformat (input, "preallocated-half-open-connections %d",
- &tcp_cfg.preallocated_half_open_connections))
- ;
- else if (unformat (input, "buffer-fail-fraction %f",
- &tcp_cfg.buffer_fail_fraction))
- ;
- else if (unformat (input, "max-rx-fifo %U", unformat_memory_size,
- &memory_size))
- {
- if (memory_size >= 0x100000000)
- {
- return clib_error_return
- (0, "max-rx-fifo %llu (0x%llx) too large", memory_size,
- memory_size);
- }
- tcp_cfg.max_rx_fifo = memory_size;
- }
- else if (unformat (input, "min-rx-fifo %U", unformat_memory_size,
- &memory_size))
- {
- if (memory_size >= 0x100000000)
- {
- return clib_error_return
- (0, "min-rx-fifo %llu (0x%llx) too large", memory_size,
- memory_size);
- }
- tcp_cfg.min_rx_fifo = memory_size;
- }
- else if (unformat (input, "mtu %u", &tcp_cfg.default_mtu))
- ;
- else if (unformat (input, "rwnd-min-update-ack %d",
- &tcp_cfg.rwnd_min_update_ack))
- ;
- else if (unformat (input, "initial-cwnd-multiplier %u",
- &cwnd_multiplier))
- tcp_cfg.initial_cwnd_multiplier = cwnd_multiplier;
- else if (unformat (input, "no-tx-pacing"))
- tcp_cfg.enable_tx_pacing = 0;
- else if (unformat (input, "tso"))
- tcp_cfg.allow_tso = 1;
- else if (unformat (input, "no-csum-offload"))
- tcp_cfg.csum_offload = 0;
- else if (unformat (input, "cc-algo %U", unformat_tcp_cc_algo,
- &tcp_cfg.cc_algo))
- ;
- else if (unformat (input, "%U", unformat_tcp_cc_algo_cfg))
- ;
- else if (unformat (input, "closewait-time %u", &tmp_time))
- tcp_cfg.closewait_time = tmp_time / TCP_TIMER_TICK;
- else if (unformat (input, "timewait-time %u", &tmp_time))
- tcp_cfg.timewait_time = tmp_time / TCP_TIMER_TICK;
- else if (unformat (input, "finwait1-time %u", &tmp_time))
- tcp_cfg.finwait1_time = tmp_time / TCP_TIMER_TICK;
- else if (unformat (input, "finwait2-time %u", &tmp_time))
- tcp_cfg.finwait2_time = tmp_time / TCP_TIMER_TICK;
- else if (unformat (input, "lastack-time %u", &tmp_time))
- tcp_cfg.lastack_time = tmp_time / TCP_TIMER_TICK;
- else if (unformat (input, "closing-time %u", &tmp_time))
- tcp_cfg.closing_time = tmp_time / TCP_TIMER_TICK;
- else if (unformat (input, "cleanup-time %u", &tmp_time))
- tcp_cfg.cleanup_time = tmp_time / 1000.0;
- else
- return clib_error_return (0, "unknown input `%U'",
- format_unformat_error, input);
- }
- return 0;
-}
-
-VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp");
-
-
-/**
- * \brief Configure an ipv4 source address range
- * @param vm vlib_main_t pointer
- * @param start first ipv4 address in the source address range
- * @param end last ipv4 address in the source address range
- * @param table_id VRF / table ID, 0 for the default FIB
- * @return 0 if all OK, else an error indication from api_errno.h
- */
-
-int
-tcp_configure_v4_source_address_range (vlib_main_t * vm,
- ip4_address_t * start,
- ip4_address_t * end, u32 table_id)
-{
- u32 start_host_byte_order, end_host_byte_order;
- fib_prefix_t prefix;
- fib_node_index_t fei;
- u32 fib_index = 0;
- u32 sw_if_index;
- int rv;
-
- clib_memset (&prefix, 0, sizeof (prefix));
-
- fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id);
-
- if (fib_index == ~0)
- return VNET_API_ERROR_NO_SUCH_FIB;
-
- start_host_byte_order = clib_net_to_host_u32 (start->as_u32);
- end_host_byte_order = clib_net_to_host_u32 (end->as_u32);
-
- /* sanity check for reversed args or some such */
- if ((end_host_byte_order - start_host_byte_order) > (10 << 10))
- return VNET_API_ERROR_INVALID_ARGUMENT;
-
- /* Lookup the last address, to identify the interface involved */
- prefix.fp_len = 32;
- prefix.fp_proto = FIB_PROTOCOL_IP4;
- memcpy (&prefix.fp_addr.ip4, end, sizeof (ip4_address_t));
-
- fei = fib_table_lookup (fib_index, &prefix);
-
- /* Couldn't find route to destination. Bail out. */
- if (fei == FIB_NODE_INDEX_INVALID)
- return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
-
- sw_if_index = fib_entry_get_resolving_interface (fei);
-
- /* Configure proxy arp across the range */
- rv = ip4_neighbor_proxy_add (fib_index, start, end);
-
- if (rv)
- return rv;
-
- rv = ip4_neighbor_proxy_enable (sw_if_index);
-
- if (rv)
- return rv;
-
- do
- {
- dpo_id_t dpo = DPO_INVALID;
-
- vec_add1 (tcp_cfg.ip4_src_addrs, start[0]);
-
- /* Add local adjacencies for the range */
-
- receive_dpo_add_or_lock (DPO_PROTO_IP4, ~0 /* sw_if_index */ ,
- NULL, &dpo);
- prefix.fp_len = 32;
- prefix.fp_proto = FIB_PROTOCOL_IP4;
- prefix.fp_addr.ip4.as_u32 = start->as_u32;
-
- fib_table_entry_special_dpo_update (fib_index,
- &prefix,
- FIB_SOURCE_API,
- FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
- dpo_reset (&dpo);
-
- start_host_byte_order++;
- start->as_u32 = clib_host_to_net_u32 (start_host_byte_order);
- }
- while (start_host_byte_order <= end_host_byte_order);
-
- return 0;
-}
-
-/**
- * \brief Configure an ipv6 source address range
- * @param vm vlib_main_t pointer
- * @param start first ipv6 address in the source address range
- * @param end last ipv6 address in the source address range
- * @param table_id VRF / table ID, 0 for the default FIB
- * @return 0 if all OK, else an error indication from api_errno.h
- */
-
-int
-tcp_configure_v6_source_address_range (vlib_main_t * vm,
- ip6_address_t * start,
- ip6_address_t * end, u32 table_id)
-{
- fib_prefix_t prefix;
- u32 fib_index = 0;
- fib_node_index_t fei;
- u32 sw_if_index;
-
- clib_memset (&prefix, 0, sizeof (prefix));
-
- fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id);
-
- if (fib_index == ~0)
- return VNET_API_ERROR_NO_SUCH_FIB;
-
- while (1)
- {
- int i;
- ip6_address_t tmp;
- dpo_id_t dpo = DPO_INVALID;
-
- /* Remember this address */
- vec_add1 (tcp_cfg.ip6_src_addrs, start[0]);
-
- /* Lookup the prefix, to identify the interface involved */
- prefix.fp_len = 128;
- prefix.fp_proto = FIB_PROTOCOL_IP6;
- memcpy (&prefix.fp_addr.ip6, start, sizeof (ip6_address_t));
-
- fei = fib_table_lookup (fib_index, &prefix);
-
- /* Couldn't find route to destination. Bail out. */
- if (fei == FIB_NODE_INDEX_INVALID)
- return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
-
- sw_if_index = fib_entry_get_resolving_interface (fei);
-
- if (sw_if_index == (u32) ~ 0)
- return VNET_API_ERROR_NO_MATCHING_INTERFACE;
-
- /* Add a proxy neighbor discovery entry for this address */
- ip6_neighbor_proxy_add (sw_if_index, start);
-
- /* Add a receive adjacency for this address */
- receive_dpo_add_or_lock (DPO_PROTO_IP6, ~0 /* sw_if_index */ ,
- NULL, &dpo);
-
- fib_table_entry_special_dpo_update (fib_index,
- &prefix,
- FIB_SOURCE_API,
- FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
- dpo_reset (&dpo);
-
- /* Done with the entire range? */
- if (!memcmp (start, end, sizeof (start[0])))
- break;
-
- /* Increment the address. DGMS. */
- tmp = start[0];
- for (i = 15; i >= 0; i--)
- {
- tmp.as_u8[i] += 1;
- if (tmp.as_u8[i] != 0)
- break;
- }
- start[0] = tmp;
- }
- return 0;
-}
-
-static clib_error_t *
-tcp_src_address_fn (vlib_main_t * vm,
- unformat_input_t * input, vlib_cli_command_t * cmd_arg)
-{
- ip4_address_t v4start, v4end;
- ip6_address_t v6start, v6end;
- u32 table_id = 0;
- int v4set = 0;
- int v6set = 0;
- int rv;
-
- while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- {
- if (unformat (input, "%U - %U", unformat_ip4_address, &v4start,
- unformat_ip4_address, &v4end))
- v4set = 1;
- else if (unformat (input, "%U", unformat_ip4_address, &v4start))
- {
- memcpy (&v4end, &v4start, sizeof (v4start));
- v4set = 1;
- }
- else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start,
- unformat_ip6_address, &v6end))
- v6set = 1;
- else if (unformat (input, "%U", unformat_ip6_address, &v6start))
- {
- memcpy (&v6end, &v6start, sizeof (v6start));
- v6set = 1;
- }
- else if (unformat (input, "fib-table %d", &table_id))
- ;
- else
- break;
- }
-
- if (!v4set && !v6set)
- return clib_error_return (0, "at least one v4 or v6 address required");
-
- if (v4set)
- {
- rv = tcp_configure_v4_source_address_range (vm, &v4start, &v4end,
- table_id);
- switch (rv)
- {
- case 0:
- break;
-
- case VNET_API_ERROR_NO_SUCH_FIB:
- return clib_error_return (0, "Invalid table-id %d", table_id);
-
- case VNET_API_ERROR_INVALID_ARGUMENT:
- return clib_error_return (0, "Invalid address range %U - %U",
- format_ip4_address, &v4start,
- format_ip4_address, &v4end);
- default:
- return clib_error_return (0, "error %d", rv);
- break;
- }
- }
- if (v6set)
- {
- rv = tcp_configure_v6_source_address_range (vm, &v6start, &v6end,
- table_id);
- switch (rv)
- {
- case 0:
- break;
-
- case VNET_API_ERROR_NO_SUCH_FIB:
- return clib_error_return (0, "Invalid table-id %d", table_id);
-
- default:
- return clib_error_return (0, "error %d", rv);
- break;
- }
- }
- return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (tcp_src_address_command, static) =
-{
- .path = "tcp src-address",
- .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range",
- .function = tcp_src_address_fn,
-};
-/* *INDENT-ON* */
-
-static u8 *
-tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb)
-{
-#if TCP_SCOREBOARD_TRACE
-
- scoreboard_trace_elt_t *block;
- int i = 0;
-
- if (!sb->trace)
- return s;
-
- s = format (s, "scoreboard trace:");
- vec_foreach (block, sb->trace)
- {
- s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end,
- block->ack, block->snd_una_max, block->group);
- if ((++i % 3) == 0)
- s = format (s, "\n");
- }
- return s;
-#else
- return 0;
-#endif
-}
-
-static clib_error_t *
-tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
- vlib_cli_command_t * cmd_arg)
-{
- transport_connection_t *tconn = 0;
- tcp_connection_t *tc;
- u8 *s = 0;
- while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- {
- if (unformat (input, "%U", unformat_transport_connection, &tconn,
- TRANSPORT_PROTO_TCP))
- ;
- else
- return clib_error_return (0, "unknown input `%U'",
- format_unformat_error, input);
- }
-
- if (!TCP_SCOREBOARD_TRACE)
- {
- vlib_cli_output (vm, "scoreboard tracing not enabled");
- return 0;
- }
-
- tc = tcp_get_connection_from_transport (tconn);
- s = tcp_scoreboard_dump_trace (s, &tc->sack_sb);
- vlib_cli_output (vm, "%v", s);
- return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) =
-{
- .path = "show tcp scoreboard trace",
- .short_help = "show tcp scoreboard trace <connection>",
- .function = tcp_show_scoreboard_trace_fn,
-};
-/* *INDENT-ON* */
-
-u8 *
-tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose)
-{
- int i, trace_len;
- scoreboard_trace_elt_t *trace;
- u32 next_ack, left, group, has_new_ack = 0;
- tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc;
- sack_block_t *block;
-
- if (!TCP_SCOREBOARD_TRACE)
- {
- s = format (s, "scoreboard tracing not enabled");
- return s;
- }
-
- if (!tc)
- return s;
-
- clib_memset (dummy_tc, 0, sizeof (*dummy_tc));
- tcp_connection_timers_init (dummy_tc);
- scoreboard_init (&dummy_tc->sack_sb);
- dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;
-
-#if TCP_SCOREBOARD_TRACE
- trace = tc->sack_sb.trace;
- trace_len = vec_len (tc->sack_sb.trace);
-#endif
-
- for (i = 0; i < trace_len; i++)
- {
- if (trace[i].ack != 0)
- {
- dummy_tc->snd_una = trace[i].ack - 1448;
- dummy_tc->snd_una_max = trace[i].ack;
- }
- }
-
- left = 0;
- while (left < trace_len)
- {
- group = trace[left].group;
- vec_reset_length (dummy_tc->rcv_opts.sacks);
- has_new_ack = 0;
- while (trace[left].group == group)
- {
- if (trace[left].ack != 0)
- {
- if (verbose)
- s = format (s, "Adding ack %u, snd_una_max %u, segs: ",
- trace[left].ack, trace[left].snd_una_max);
- dummy_tc->snd_una_max = trace[left].snd_una_max;
- next_ack = trace[left].ack;
- has_new_ack = 1;
- }
- else
- {
- if (verbose)
- s = format (s, "[%u, %u], ", trace[left].start,
- trace[left].end);
- vec_add2 (dummy_tc->rcv_opts.sacks, block, 1);
- block->start = trace[left].start;
- block->end = trace[left].end;
- }
- left++;
- }
-
- /* Push segments */
- tcp_rcv_sacks (dummy_tc, next_ack);
- if (has_new_ack)
- dummy_tc->snd_una = next_ack;
-
- if (verbose)
- s = format (s, "result: %U", format_tcp_scoreboard,
- &dummy_tc->sack_sb);
-
- }
- s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb);
-
- return s;
-}
-
-static clib_error_t *
-tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
- vlib_cli_command_t * cmd_arg)
-{
- transport_connection_t *tconn = 0;
- tcp_connection_t *tc = 0;
- u8 *str = 0;
- while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- {
- if (unformat (input, "%U", unformat_transport_connection, &tconn,
- TRANSPORT_PROTO_TCP))
- ;
- else
- return clib_error_return (0, "unknown input `%U'",
- format_unformat_error, input);
- }
-
- if (!TCP_SCOREBOARD_TRACE)
- {
- vlib_cli_output (vm, "scoreboard tracing not enabled");
- return 0;
- }
-
- tc = tcp_get_connection_from_transport (tconn);
- if (!tc)
- {
- vlib_cli_output (vm, "connection not found");
- return 0;
- }
- str = tcp_scoreboard_replay (str, tc, 1);
- vlib_cli_output (vm, "%v", str);
- return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) =
-{
- .path = "tcp replay scoreboard",
- .short_help = "tcp replay scoreboard <connection>",
- .function = tcp_scoreboard_trace_fn,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * input,
- vlib_cli_command_t * cmd_arg)
-{
- tcp_main_t *tm = vnet_get_tcp_main ();
- if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- return clib_error_return (0, "unknown input `%U'", format_unformat_error,
- input);
- vlib_cli_output (vm, "IPv4 TCP punt: %s",
- tm->punt_unknown4 ? "enabled" : "disabled");
- vlib_cli_output (vm, "IPv6 TCP punt: %s",
- tm->punt_unknown6 ? "enabled" : "disabled");
- return 0;
-}
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (show_tcp_punt_command, static) =
-{
- .path = "show tcp punt",
- .short_help = "show tcp punt",
- .function = show_tcp_punt_fn,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-show_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input,
- vlib_cli_command_t * cmd)
-{
- tcp_main_t *tm = vnet_get_tcp_main ();
- tcp_worker_ctx_t *wrk;
- u32 thread;
-
- if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- return clib_error_return (0, "unknown input `%U'", format_unformat_error,
- input);
- for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++)
- {
- wrk = tcp_get_worker (thread);
- vlib_cli_output (vm, "Thread %u:\n", thread);
-
- if (clib_fifo_elts (wrk->pending_timers))
- vlib_cli_output (vm, " %lu pending timers",
- clib_fifo_elts (wrk->pending_timers));
-
-#define _(name,type,str) \
- if (wrk->stats.name) \
- vlib_cli_output (vm, " %lu %s", wrk->stats.name, str);
- foreach_tcp_wrk_stat
-#undef _
- }
-
- return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (show_tcp_stats_command, static) =
-{
- .path = "show tcp stats",
- .short_help = "show tcp stats",
- .function = show_tcp_stats_fn,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-clear_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input,
- vlib_cli_command_t * cmd)
-{
- tcp_main_t *tm = vnet_get_tcp_main ();
- tcp_worker_ctx_t *wrk;
- u32 thread;
-
- if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
- return clib_error_return (0, "unknown input `%U'", format_unformat_error,
- input);
-
- for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++)
- {
- wrk = tcp_get_worker (thread);
- clib_memset (&wrk->stats, 0, sizeof (wrk->stats));
- }
-
- return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (clear_tcp_stats_command, static) =
-{
- .path = "clear tcp stats",
- .short_help = "clear tcp stats",
- .function = clear_tcp_stats_fn,
-};
-/* *INDENT-ON* */
-
/*
* fd.io coding-style-patch-verification: ON
*
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index 30c95a4..708d756 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -18,464 +18,13 @@
#include <vnet/vnet.h>
#include <vnet/ip/ip.h>
-#include <vnet/tcp/tcp_packet.h>
-#include <vnet/tcp/tcp_timer.h>
-#include <vnet/session/transport.h>
#include <vnet/session/session.h>
+#include <vnet/tcp/tcp_types.h>
+#include <vnet/tcp/tcp_timer.h>
#include <vnet/tcp/tcp_debug.h>
-
-#define TCP_TICK 0.001 /**< TCP tick period (s) */
-#define THZ (u32) (1/TCP_TICK) /**< TCP tick frequency */
-#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */
-#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */
-#define TCP_FIB_RECHECK_PERIOD 1 * THZ /**< Recheck every 1s */
-#define TCP_MAX_OPTION_SPACE 40
-#define TCP_CC_DATA_SZ 24
-#define TCP_MAX_GSO_SZ 65536
-#define TCP_RXT_MAX_BURST 10
-
-#define TCP_DUPACK_THRESHOLD 3
-#define TCP_IW_N_SEGMENTS 10
-#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */
-#define TCP_USE_SACKS 1 /**< Disable only for testing */
-
-/** TCP FSM state definitions as per RFC793. */
-#define foreach_tcp_fsm_state \
- _(CLOSED, "CLOSED") \
- _(LISTEN, "LISTEN") \
- _(SYN_SENT, "SYN_SENT") \
- _(SYN_RCVD, "SYN_RCVD") \
- _(ESTABLISHED, "ESTABLISHED") \
- _(CLOSE_WAIT, "CLOSE_WAIT") \
- _(FIN_WAIT_1, "FIN_WAIT_1") \
- _(LAST_ACK, "LAST_ACK") \
- _(CLOSING, "CLOSING") \
- _(FIN_WAIT_2, "FIN_WAIT_2") \
- _(TIME_WAIT, "TIME_WAIT")
-
-typedef enum _tcp_state
-{
-#define _(sym, str) TCP_STATE_##sym,
- foreach_tcp_fsm_state
-#undef _
- TCP_N_STATES
-} tcp_state_t;
-
-format_function_t format_tcp_state;
-format_function_t format_tcp_flags;
-format_function_t format_tcp_sacks;
-format_function_t format_tcp_rcv_sacks;
-
-/** TCP timers */
-#define foreach_tcp_timer \
- _(RETRANSMIT, "RETRANSMIT") \
- _(DELACK, "DELAYED ACK") \
- _(PERSIST, "PERSIST") \
- _(WAITCLOSE, "WAIT CLOSE") \
- _(RETRANSMIT_SYN, "RETRANSMIT SYN") \
-
-typedef enum _tcp_timers
-{
-#define _(sym, str) TCP_TIMER_##sym,
- foreach_tcp_timer
-#undef _
- TCP_N_TIMERS
-} tcp_timers_e;
-
-#define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
-
-#define TCP_TIMER_TICK 0.1 /**< Timer tick in seconds */
-#define TCP_TO_TIMER_TICK TCP_TICK*10 /**< Factor for converting
- ticks to timer ticks */
-
-#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */
-#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */
-#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */
-#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */
-#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */
-#define TCP_RTO_BOFF_MAX 8 /* Max number of retries before reset */
-#define TCP_ESTABLISH_TIME (60 * THZ) /* Connection establish timeout */
-
-/** Connection configuration flags */
-#define foreach_tcp_cfg_flag \
- _(RATE_SAMPLE, "Rate sampling") \
- _(NO_CSUM_OFFLOAD, "No csum offload") \
- _(NO_TSO, "TSO off") \
- _(TSO, "TSO") \
- _(NO_ENDPOINT,"No endpoint") \
-
-typedef enum tcp_cfg_flag_bits_
-{
-#define _(sym, str) TCP_CFG_F_##sym##_BIT,
- foreach_tcp_cfg_flag
-#undef _
- TCP_CFG_N_FLAG_BITS
-} tcp_cfg_flag_bits_e;
-
-typedef enum tcp_cfg_flag_
-{
-#define _(sym, str) TCP_CFG_F_##sym = 1 << TCP_CFG_F_##sym##_BIT,
- foreach_tcp_cfg_flag
-#undef _
- TCP_CFG_N_FLAGS
-} tcp_cfg_flags_e;
-
-/** TCP connection flags */
-#define foreach_tcp_connection_flag \
- _(SNDACK, "Send ACK") \
- _(FINSNT, "FIN sent") \
- _(RECOVERY, "Recovery") \
- _(FAST_RECOVERY, "Fast Recovery") \
- _(DCNT_PENDING, "Disconnect pending") \
- _(HALF_OPEN_DONE, "Half-open completed") \
- _(FINPNDG, "FIN pending") \
- _(RXT_PENDING, "Retransmit pending") \
- _(FRXT_FIRST, "Retransmit first") \
- _(DEQ_PENDING, "Dequeue pending ") \
- _(PSH_PENDING, "PSH pending") \
- _(FINRCVD, "FIN received") \
- _(ZERO_RWND_SENT, "Zero RWND sent") \
-
-typedef enum tcp_connection_flag_bits_
-{
-#define _(sym, str) TCP_CONN_##sym##_BIT,
- foreach_tcp_connection_flag
-#undef _
- TCP_CONN_N_FLAG_BITS
-} tcp_connection_flag_bits_e;
-
-typedef enum tcp_connection_flag_
-{
-#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT,
- foreach_tcp_connection_flag
-#undef _
- TCP_CONN_N_FLAGS
-} tcp_connection_flags_e;
-
-#define TCP_SCOREBOARD_TRACE (0)
-#define TCP_MAX_SACK_BLOCKS 256 /**< Max number of SACK blocks stored */
-#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0)
-
-typedef struct _scoreboard_trace_elt
-{
- u32 start;
- u32 end;
- u32 ack;
- u32 snd_una_max;
- u32 group;
-} scoreboard_trace_elt_t;
-
-typedef struct _sack_scoreboard_hole
-{
- u32 next; /**< Index for next entry in linked list */
- u32 prev; /**< Index for previous entry in linked list */
- u32 start; /**< Start sequence number */
- u32 end; /**< End sequence number */
- u8 is_lost; /**< Mark hole as lost */
-} sack_scoreboard_hole_t;
-
-typedef struct _sack_scoreboard
-{
- sack_scoreboard_hole_t *holes; /**< Pool of holes */
- u32 head; /**< Index of first entry */
- u32 tail; /**< Index of last entry */
- u32 sacked_bytes; /**< Number of bytes sacked in sb */
- u32 last_sacked_bytes; /**< Number of bytes last sacked */
- u32 last_bytes_delivered; /**< Sack bytes delivered to app */
- u32 rxt_sacked; /**< Rxt bytes last delivered */
- u32 high_sacked; /**< Highest byte sacked (fack) */
- u32 high_rxt; /**< Highest retransmitted sequence */
- u32 rescue_rxt; /**< Rescue sequence number */
- u32 lost_bytes; /**< Bytes lost as per RFC6675 */
- u32 last_lost_bytes; /**< Number of bytes last lost */
- u32 cur_rxt_hole; /**< Retransmitting from this hole */
- u8 is_reneging;
-
-#if TCP_SCOREBOARD_TRACE
- scoreboard_trace_elt_t *trace;
-#endif
-
-} sack_scoreboard_t;
-
-#if TCP_SCOREBOARD_TRACE
-#define tcp_scoreboard_trace_add(_tc, _ack) \
-{ \
- static u64 _group = 0; \
- sack_scoreboard_t *_sb = &_tc->sack_sb; \
- sack_block_t *_sack, *_sacks; \
- scoreboard_trace_elt_t *_elt; \
- int i; \
- _group++; \
- _sacks = _tc->rcv_opts.sacks; \
- for (i = 0; i < vec_len (_sacks); i++) \
- { \
- _sack = &_sacks[i]; \
- vec_add2 (_sb->trace, _elt, 1); \
- _elt->start = _sack->start; \
- _elt->end = _sack->end; \
- _elt->ack = _elt->end == _ack ? _ack : 0; \
- _elt->snd_una_max = _elt->end == _ack ? _tc->snd_una_max : 0; \
- _elt->group = _group; \
- } \
-}
-#else
-#define tcp_scoreboard_trace_add(_tc, _ack)
-#endif
-
-sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
- sack_scoreboard_hole_t *
- start, u8 have_sent_1_smss,
- u8 * can_rescue,
- u8 * snd_limited);
-sack_scoreboard_hole_t *scoreboard_get_hole (sack_scoreboard_t * sb,
- u32 index);
-
-sack_scoreboard_hole_t *scoreboard_next_hole (sack_scoreboard_t * sb,
- sack_scoreboard_hole_t * hole);
-sack_scoreboard_hole_t *scoreboard_prev_hole (sack_scoreboard_t * sb,
- sack_scoreboard_hole_t * hole);
-sack_scoreboard_hole_t *scoreboard_first_hole (sack_scoreboard_t * sb);
-sack_scoreboard_hole_t *scoreboard_last_hole (sack_scoreboard_t * sb);
-
-void scoreboard_clear (sack_scoreboard_t * sb);
-void scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end);
-void scoreboard_init (sack_scoreboard_t * sb);
-void scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una);
-u8 *format_tcp_scoreboard (u8 * s, va_list * args);
-
-#define TCP_BTS_INVALID_INDEX ((u32)~0)
-
-typedef enum tcp_bts_flags_
-{
- TCP_BTS_IS_RXT = 1,
- TCP_BTS_IS_APP_LIMITED = 1 << 1,
- TCP_BTS_IS_SACKED = 1 << 2,
- TCP_BTS_IS_RXT_LOST = 1 << 3,
-} __clib_packed tcp_bts_flags_t;
-
-typedef struct tcp_bt_sample_
-{
- u32 next; /**< Next sample index in list */
- u32 prev; /**< Previous sample index in list */
- u32 min_seq; /**< Min seq number in sample */
- u32 max_seq; /**< Max seq number. Set for rxt samples */
- u64 delivered; /**< Total delivered bytes for sample */
- f64 delivered_time; /**< Delivered time when sample taken */
- f64 tx_time; /**< Transmit time for the burst */
- f64 first_tx_time; /**< Connection first tx time at tx */
- u64 tx_in_flight; /**< In flight at tx time */
- u64 tx_lost; /**< Lost at tx time */
- tcp_bts_flags_t flags; /**< Sample flag */
-} tcp_bt_sample_t;
-
-typedef struct tcp_rate_sample_
-{
- u64 prior_delivered; /**< Delivered of sample used for rate, i.e.,
- total bytes delivered at prior_time */
- f64 prior_time; /**< Delivered time of sample used for rate */
- f64 interval_time; /**< Time to ack the bytes delivered */
- f64 rtt_time; /**< RTT for sample */
- u64 tx_in_flight; /**< In flight at (re)transmit time */
- u64 tx_lost; /**< Lost over interval */
- u32 delivered; /**< Bytes delivered in interval_time */
- u32 acked_and_sacked; /**< Bytes acked + sacked now */
- u32 last_lost; /**< Bytes lost now */
- u32 lost; /**< Number of bytes lost over interval */
- tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */
-} tcp_rate_sample_t;
-
-typedef struct tcp_byte_tracker_
-{
- tcp_bt_sample_t *samples; /**< Pool of samples */
- rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */
- u32 head; /**< Head of samples linked list */
- u32 tail; /**< Tail of samples linked list */
- u32 last_ooo; /**< Cached last ooo sample */
-} tcp_byte_tracker_t;
-
-typedef enum _tcp_cc_algorithm_type
-{
- TCP_CC_NEWRENO,
- TCP_CC_CUBIC,
- TCP_CC_LAST = TCP_CC_CUBIC
-} tcp_cc_algorithm_type_e;
-
-typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t;
-
-typedef enum _tcp_cc_ack_t
-{
- TCP_CC_ACK,
- TCP_CC_DUPACK,
- TCP_CC_PARTIALACK
-} tcp_cc_ack_t;
-
-typedef enum tcp_cc_event_
-{
- TCP_CC_EVT_START_TX,
-} tcp_cc_event_t;
-
-/*
- * As per RFC4898 tcpEStatsStackSoftErrors
- */
-typedef struct tcp_errors_
-{
- u32 below_data_wnd; /**< All data in seg is below snd_una */
- u32 above_data_wnd; /**< Some data in segment is above snd_wnd */
- u32 below_ack_wnd; /**< Acks for data below snd_una */
- u32 above_ack_wnd; /**< Acks for data not sent */
-} tcp_errors_t;
-
-typedef struct _tcp_connection
-{
- CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
- transport_connection_t connection; /**< Common transport data. First! */
-
- u8 state; /**< TCP state as per tcp_state_t */
- u8 cfg_flags; /**< Connection configuration flags */
- u16 flags; /**< Connection flags (see tcp_conn_flags_e) */
- u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */
-
- u64 segs_in; /** RFC4022/4898 tcpHCInSegs/tcpEStatsPerfSegsIn */
- u64 bytes_in; /** RFC4898 tcpEStatsPerfHCDataOctetsIn */
- u64 segs_out; /** RFC4898 tcpEStatsPerfSegsOut */
- u64 bytes_out; /** RFC4898 tcpEStatsPerfHCDataOctetsOut */
-
- /** Send sequence variables RFC793 */
- u32 snd_una; /**< oldest unacknowledged sequence number */
- u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/
- u32 snd_wnd; /**< send window */
- u32 snd_wl1; /**< seq number used for last snd.wnd update */
- u32 snd_wl2; /**< ack number used for last snd.wnd update */
- u32 snd_nxt; /**< next seq number to be sent */
- u16 snd_mss; /**< Effective send max seg (data) size */
-
- u64 data_segs_in; /** RFC4898 tcpEStatsPerfDataSegsIn */
- u64 data_segs_out; /** RFC4898 tcpEStatsPerfDataSegsOut */
-
- /** Receive sequence variables RFC793 */
- u32 rcv_nxt; /**< next sequence number expected */
- u32 rcv_wnd; /**< receive window we expect */
-
- u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */
- u32 iss; /**< initial sent sequence */
- u32 irs; /**< initial remote sequence */
-
- /* Options */
- u8 snd_opts_len; /**< Tx options len */
- u8 rcv_wscale; /**< Window scale to advertise to peer */
- u8 snd_wscale; /**< Window scale to use when sending */
- u32 tsval_recent; /**< Last timestamp received */
- u32 tsval_recent_age; /**< When last updated tstamp_recent*/
- tcp_options_t snd_opts; /**< Tx options for connection */
- tcp_options_t rcv_opts; /**< Rx options for connection */
-
- sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */
- u8 snd_sack_pos; /**< Position in vec of first block to send */
- sack_block_t *snd_sacks_fl; /**< Vector for building new list */
- sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */
-
- u16 rcv_dupacks; /**< Number of recent DUPACKs received */
- u32 dupacks_in; /**< RFC4898 tcpEStatsStackDupAcksIn*/
- u8 pending_dupacks; /**< Number of DUPACKs to be sent */
- u32 dupacks_out; /**< RFC4898 tcpEStatsPathDupAcksOut */
-
- /* Congestion control */
- u32 cwnd; /**< Congestion window */
- u32 cwnd_acc_bytes; /**< Bytes accumulated for cwnd increment */
- u32 ssthresh; /**< Slow-start threshold */
- u32 prev_ssthresh; /**< ssthresh before congestion */
- u32 prev_cwnd; /**< ssthresh before congestion */
- u32 bytes_acked; /**< Bytes acknowledged by current segment */
- u32 burst_acked; /**< Bytes acknowledged in current burst */
- u32 snd_rxt_bytes; /**< Retransmitted bytes during current cc event */
- u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */
- u32 prr_delivered; /**< RFC6937 bytes delivered during current event */
- u32 prr_start; /**< snd_una when prr starts */
- u32 rxt_delivered; /**< Rxt bytes delivered during current cc event */
- u32 rxt_head; /**< snd_una last time we re rxted the head */
- u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */
- u32 snd_congestion; /**< snd_una_max when congestion is detected */
- u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */
- tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */
- u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */
-
- u32 fr_occurences; /**< fast-retransmit occurrences RFC4898
- tcpEStatsStackFastRetran */
- u32 tr_occurences; /**< timer-retransmit occurrences */
- u64 bytes_retrans; /**< RFC4898 tcpEStatsPerfOctetsRetrans */
- u64 segs_retrans; /**< RFC4898 tcpEStatsPerfSegsRetrans*/
-
- /* RTT and RTO */
- u32 rto; /**< Retransmission timeout */
- u32 rto_boff; /**< Index for RTO backoff */
- u32 srtt; /**< Smoothed RTT */
- u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */
- u32 rtt_seq; /**< Sequence number for tracked ACK */
- f64 rtt_ts; /**< Timestamp for tracked ACK */
- f64 mrtt_us; /**< High precision mrtt from tracked acks */
-
- u32 psh_seq; /**< Add psh header for seg that includes this */
- u32 next_node_index; /**< Can be used to control next node in output */
- u32 next_node_opaque; /**< Opaque to pass to next node */
- u32 limited_transmit; /**< snd_nxt when limited transmit starts */
- u32 sw_if_index; /**< Interface for the connection */
-
- /* Delivery rate estimation */
- u64 delivered; /**< Total bytes delivered to peer */
- u64 app_limited; /**< Delivered when app-limited detected */
- f64 delivered_time; /**< Time last bytes were acked */
- f64 first_tx_time; /**< Send time for recently delivered/sent */
- u64 lost; /**< Total bytes lost */
- tcp_byte_tracker_t *bt; /**< Tx byte tracker */
-
- tcp_errors_t errors; /**< Soft connection errors */
-
- f64 start_ts; /**< Timestamp when connection initialized */
- u32 last_fib_check; /**< Last time we checked fib route for peer */
- u16 mss; /**< Our max seg size that includes options */
- u32 timestamp_delta; /**< Offset for timestamp */
- u32 ipv6_flow_label; /**< flow label for ipv6 header */
-
-#define rst_state snd_wl1
-} tcp_connection_t;
-
-/* *INDENT-OFF* */
-struct _tcp_cc_algorithm
-{
- const char *name;
- uword (*unformat_cfg) (unformat_input_t * input);
- void (*init) (tcp_connection_t * tc);
- void (*cleanup) (tcp_connection_t * tc);
- void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs);
- void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack,
- tcp_rate_sample_t *rs);
- void (*congestion) (tcp_connection_t * tc);
- void (*loss) (tcp_connection_t * tc);
- void (*recovered) (tcp_connection_t * tc);
- void (*undo_recovery) (tcp_connection_t * tc);
- void (*event) (tcp_connection_t *tc, tcp_cc_event_t evt);
- u64 (*get_pacing_rate) (tcp_connection_t *tc);
-};
-/* *INDENT-ON* */
-
-#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY
-#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY
-#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY
-#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY
-#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
-#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
-#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh)
-#define tcp_disconnect_pending(tc) ((tc)->flags & TCP_CONN_DCNT_PENDING)
-#define tcp_disconnect_pending_on(tc) ((tc)->flags |= TCP_CONN_DCNT_PENDING)
-#define tcp_disconnect_pending_off(tc) ((tc)->flags &= ~TCP_CONN_DCNT_PENDING)
-#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST)
-#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST)
-#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST)
-
-#define tcp_in_cong_recovery(tc) ((tc)->flags & \
- (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
-
-#define tcp_csum_offload(tc) (!((tc)->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD))
+#include <vnet/tcp/tcp_sack.h>
+#include <vnet/tcp/tcp_bt.h>
+#include <vnet/tcp/tcp_cc.h>
typedef void (timer_expiration_handler) (tcp_connection_t * tc);
@@ -484,17 +33,6 @@
extern timer_expiration_handler tcp_timer_persist_handler;
extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
-always_inline void
-tcp_cong_recovery_off (tcp_connection_t * tc)
-{
- tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
- tcp_fastrecovery_first_off (tc);
-}
-
-#define tcp_zero_rwnd_sent(tc) ((tc)->flags & TCP_CONN_ZERO_RWND_SENT)
-#define tcp_zero_rwnd_sent_on(tc) (tc)->flags |= TCP_CONN_ZERO_RWND_SENT
-#define tcp_zero_rwnd_sent_off(tc) (tc)->flags &= ~TCP_CONN_ZERO_RWND_SENT
-
typedef enum _tcp_error
{
#define tcp_error(n,s) TCP_ERROR_##n,
@@ -752,14 +290,6 @@
return &tcp_main.wrk_ctx[thread_index];
}
-always_inline tcp_header_t *
-tcp_buffer_hdr (vlib_buffer_t * b)
-{
- ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE);
- return (tcp_header_t *) (b->data + b->current_data
- + vnet_buffer (b)->tcp.hdr_offset);
-}
-
#if (VLIB_BUFFER_TRACE_TRAJECTORY)
#define tcp_trajectory_add_start(b, start) \
{ \
@@ -769,84 +299,15 @@
#define tcp_trajectory_add_start(b, start)
#endif
-clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en);
-
-void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add);
-
-always_inline tcp_connection_t *
-tcp_connection_get (u32 conn_index, u32 thread_index)
-{
- tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
- if (PREDICT_FALSE (pool_is_free_index (wrk->connections, conn_index)))
- return 0;
- return pool_elt_at_index (wrk->connections, conn_index);
-}
-
-always_inline tcp_connection_t *
-tcp_connection_get_if_valid (u32 conn_index, u32 thread_index)
-{
- tcp_worker_ctx_t *wrk;
- if (thread_index >= vec_len (tcp_main.wrk_ctx))
- return 0;
- wrk = tcp_get_worker (thread_index);
- if (pool_is_free_index (wrk->connections, conn_index))
- return 0;
- return pool_elt_at_index (wrk->connections, conn_index);
-}
-
-always_inline tcp_connection_t *
-tcp_get_connection_from_transport (transport_connection_t * tconn)
-{
- return (tcp_connection_t *) tconn;
-}
-
-always_inline void
-tcp_connection_set_state (tcp_connection_t * tc, tcp_state_t state)
-{
- tc->state = state;
- TCP_EVT (TCP_EVT_STATE_CHANGE, tc);
-}
-
-void tcp_connection_close (tcp_connection_t * tc);
-void tcp_connection_cleanup (tcp_connection_t * tc);
-void tcp_connection_del (tcp_connection_t * tc);
-int tcp_half_open_connection_cleanup (tcp_connection_t * tc);
tcp_connection_t *tcp_connection_alloc (u8 thread_index);
tcp_connection_t *tcp_connection_alloc_w_base (u8 thread_index,
tcp_connection_t * base);
void tcp_connection_free (tcp_connection_t * tc);
-int tcp_configure_v4_source_address_range (vlib_main_t * vm,
- ip4_address_t * start,
- ip4_address_t * end, u32 table_id);
-int tcp_configure_v6_source_address_range (vlib_main_t * vm,
- ip6_address_t * start,
- ip6_address_t * end, u32 table_id);
-void tcp_api_reference (void);
-u8 *format_tcp_connection (u8 * s, va_list * args);
-u8 *format_tcp_connection_id (u8 * s, va_list * args);
+void tcp_connection_close (tcp_connection_t * tc);
+void tcp_connection_cleanup (tcp_connection_t * tc);
+void tcp_connection_del (tcp_connection_t * tc);
+int tcp_half_open_connection_cleanup (tcp_connection_t * tc);
-always_inline tcp_connection_t *
-tcp_listener_get (u32 tli)
-{
- tcp_connection_t *tc = 0;
- if (!pool_is_free_index (tcp_main.listener_pool, tli))
- tc = pool_elt_at_index (tcp_main.listener_pool, tli);
- return tc;
-}
-
-always_inline tcp_connection_t *
-tcp_half_open_connection_get (u32 conn_index)
-{
- tcp_connection_t *tc = 0;
- clib_spinlock_lock_if_init (&tcp_main.half_open_lock);
- if (!pool_is_free_index (tcp_main.half_open_connections, conn_index))
- tc = pool_elt_at_index (tcp_main.half_open_connections, conn_index);
- clib_spinlock_unlock_if_init (&tcp_main.half_open_lock);
- return tc;
-}
-
-void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b);
-void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b);
void tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt,
u32 thread_index, u8 is_ip4);
void tcp_send_reset (tcp_connection_t * tc);
@@ -854,256 +315,17 @@
void tcp_send_synack (tcp_connection_t * tc);
void tcp_send_fin (tcp_connection_t * tc);
void tcp_send_ack (tcp_connection_t * tc);
-void tcp_update_burst_snd_vars (tcp_connection_t * tc);
-void tcp_update_rto (tcp_connection_t * tc);
void tcp_send_window_update_ack (tcp_connection_t * tc);
void tcp_program_ack (tcp_connection_t * tc);
void tcp_program_dupack (tcp_connection_t * tc);
void tcp_program_retransmit (tcp_connection_t * tc);
-/*
- * Rate estimation
- */
-
-/**
- * Byte tracker initialize
- *
- * @param tc connection for which the byte tracker should be allocated and
- * initialized
- */
-void tcp_bt_init (tcp_connection_t * tc);
-/**
- * Byte tracker cleanup
- *
- * @param tc connection for which the byte tracker should be cleaned up
- */
-void tcp_bt_cleanup (tcp_connection_t * tc);
-/**
- * Flush byte tracker samples
- *
- * @param tc tcp connection for which samples should be flushed
- */
-void tcp_bt_flush_samples (tcp_connection_t * tc);
-/**
- * Track a tcp tx burst
- *
- * @param tc tcp connection
- */
-void tcp_bt_track_tx (tcp_connection_t * tc, u32 len);
-/**
- * Track a tcp retransmission
- *
- * @param tc tcp connection
- * @param start start sequence number
- * @param end end sequence number
- */
-void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end);
-/**
- * Generate a delivery rate sample from recently acked bytes
- *
- * @param tc tcp connection
- * @param rs resulting rate sample
- */
-void tcp_bt_sample_delivery_rate (tcp_connection_t * tc,
- tcp_rate_sample_t * rs);
-/**
- * Check if sample to be generated is app limited
- *
- * @param tc tcp connection
- */
-void tcp_bt_check_app_limited (tcp_connection_t * tc);
-/**
- * Check if the byte tracker is in sane state
- *
- * Should be used only for testing
- *
- * @param bt byte tracker
- */
-int tcp_bt_is_sane (tcp_byte_tracker_t * bt);
-u8 *format_tcp_bt (u8 * s, va_list * args);
-
-always_inline u32
-tcp_end_seq (tcp_header_t * th, u32 len)
-{
- return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len;
-}
-
-/* Modulo arithmetic for TCP sequence numbers */
-#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0)
-#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0)
-#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0)
-#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0)
-#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2))
-
-/* Modulo arithmetic for timestamps */
-#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
-#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0)
-
-/**
- * Our estimate of the number of bytes that have left the network
- */
-always_inline u32
-tcp_bytes_out (const tcp_connection_t * tc)
-{
- if (tcp_opts_sack_permitted (&tc->rcv_opts))
- return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
- else
- return clib_min (tc->rcv_dupacks * tc->snd_mss,
- tc->snd_nxt - tc->snd_una);
-}
-
-/**
- * Our estimate of the number of bytes in flight (pipe size)
- */
-always_inline u32
-tcp_flight_size (const tcp_connection_t * tc)
-{
- int flight_size;
-
- flight_size = (int) (tc->snd_nxt - tc->snd_una) - tcp_bytes_out (tc)
- + tc->snd_rxt_bytes - tc->rxt_delivered;
-
- ASSERT (flight_size >= 0);
-
- return flight_size;
-}
-
-/**
- * Initial cwnd as per RFC5681
- */
-always_inline u32
-tcp_initial_cwnd (const tcp_connection_t * tc)
-{
- if (tcp_cfg.initial_cwnd_multiplier > 0)
- return tcp_cfg.initial_cwnd_multiplier * tc->snd_mss;
-
- if (tc->snd_mss > 2190)
- return 2 * tc->snd_mss;
- else if (tc->snd_mss > 1095)
- return 3 * tc->snd_mss;
- else
- return 4 * tc->snd_mss;
-}
-
-/*
- * Accumulate acked bytes for cwnd increase
- *
- * Once threshold bytes are accumulated, snd_mss bytes are added
- * to the cwnd.
- */
-always_inline void
-tcp_cwnd_accumulate (tcp_connection_t * tc, u32 thresh, u32 bytes)
-{
- tc->cwnd_acc_bytes += bytes;
- if (tc->cwnd_acc_bytes >= thresh)
- {
- u32 inc = tc->cwnd_acc_bytes / thresh;
- tc->cwnd_acc_bytes -= inc * thresh;
- tc->cwnd += inc * tc->snd_mss;
- tc->cwnd = clib_min (tc->cwnd, tc->tx_fifo_size);
- }
-}
-
-always_inline u32
-tcp_loss_wnd (const tcp_connection_t * tc)
-{
- /* Whatever we have in flight + the packet we're about to send */
- return tcp_flight_size (tc) + tc->snd_mss;
-}
-
-always_inline u32
-tcp_available_snd_wnd (const tcp_connection_t * tc)
-{
- return clib_min (tc->cwnd, tc->snd_wnd);
-}
-
-always_inline u32
-tcp_available_output_snd_space (const tcp_connection_t * tc)
-{
- u32 available_wnd = tcp_available_snd_wnd (tc);
- int flight_size = (int) (tc->snd_nxt - tc->snd_una);
-
- if (available_wnd <= flight_size)
- return 0;
-
- return available_wnd - flight_size;
-}
-
-/**
- * Estimate of how many bytes we can still push into the network
- */
-always_inline u32
-tcp_available_cc_snd_space (const tcp_connection_t * tc)
-{
- u32 available_wnd = tcp_available_snd_wnd (tc);
- u32 flight_size = tcp_flight_size (tc);
-
- if (available_wnd <= flight_size)
- return 0;
-
- return available_wnd - flight_size;
-}
-
-static inline u8
-tcp_is_descheduled (tcp_connection_t * tc)
-{
- return (transport_connection_is_descheduled (&tc->connection) ? 1 : 0);
-}
-
-always_inline u8
-tcp_is_lost_fin (tcp_connection_t * tc)
-{
- if ((tc->flags & TCP_CONN_FINSNT) && (tc->snd_una_max - tc->snd_una == 1))
- return 1;
- return 0;
-}
-
+void tcp_update_burst_snd_vars (tcp_connection_t * tc);
u32 tcp_snd_space (tcp_connection_t * tc);
int tcp_fastrecovery_prr_snd_space (tcp_connection_t * tc);
void tcp_reschedule (tcp_connection_t * tc);
-
fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc);
-
-/* Made public for unit testing only */
-void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
-u32 tcp_sack_list_bytes (tcp_connection_t * tc);
-
-always_inline u32
-tcp_time_now (void)
-{
- return tcp_main.wrk_ctx[vlib_get_thread_index ()].time_now;
-}
-
-always_inline u32
-tcp_time_now_w_thread (u32 thread_index)
-{
- return tcp_main.wrk_ctx[thread_index].time_now;
-}
-
-/**
- * Generate timestamp for tcp connection
- */
-always_inline u32
-tcp_tstamp (tcp_connection_t * tc)
-{
- return (tcp_main.wrk_ctx[tc->c_thread_index].time_now -
- tc->timestamp_delta);
-}
-
-always_inline f64
-tcp_time_now_us (u32 thread_index)
-{
- return transport_time_now (thread_index);
-}
-
-always_inline u32
-tcp_set_time_now (tcp_worker_ctx_t * wrk)
-{
- wrk->time_now = clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock;
- return wrk->time_now;
-}
-
u32 tcp_session_push_header (transport_connection_t * tconn,
vlib_buffer_t * b);
int tcp_session_custom_tx (void *conn, u32 max_burst_size);
@@ -1117,64 +339,22 @@
u32 start_bucket);
void tcp_program_cleanup (tcp_worker_ctx_t * wrk, tcp_connection_t * tc);
-always_inline void
-tcp_cc_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
-{
- tc->cc_algo->rcv_ack (tc, rs);
- tc->tsecr_last_ack = tc->rcv_opts.tsecr;
-}
+void tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add);
+int tcp_configure_v4_source_address_range (vlib_main_t * vm,
+ ip4_address_t * start,
+ ip4_address_t * end, u32 table_id);
+int tcp_configure_v6_source_address_range (vlib_main_t * vm,
+ ip6_address_t * start,
+ ip6_address_t * end, u32 table_id);
-static inline void
-tcp_cc_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
- tcp_rate_sample_t * rs)
-{
- tc->cc_algo->rcv_cong_ack (tc, ack_type, rs);
-}
+clib_error_t *vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en);
-static inline void
-tcp_cc_congestion (tcp_connection_t * tc)
-{
- tc->cc_algo->congestion (tc);
-}
-
-static inline void
-tcp_cc_loss (tcp_connection_t * tc)
-{
- tc->cc_algo->loss (tc);
-}
-
-static inline void
-tcp_cc_recovered (tcp_connection_t * tc)
-{
- tc->cc_algo->recovered (tc);
-}
-
-static inline void
-tcp_cc_undo_recovery (tcp_connection_t * tc)
-{
- if (tc->cc_algo->undo_recovery)
- tc->cc_algo->undo_recovery (tc);
-}
-
-static inline void
-tcp_cc_event (tcp_connection_t * tc, tcp_cc_event_t evt)
-{
- if (tc->cc_algo->event)
- tc->cc_algo->event (tc, evt);
-}
-
-static inline u64
-tcp_cc_get_pacing_rate (tcp_connection_t * tc)
-{
- if (tc->cc_algo->get_pacing_rate)
- return tc->cc_algo->get_pacing_rate (tc);
-
- f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
-
- /* TODO should constrain to interface's max throughput but
- * we don't have link speeds for sw ifs ..*/
- return ((f64) tc->cwnd / srtt);
-}
+format_function_t format_tcp_state;
+format_function_t format_tcp_flags;
+format_function_t format_tcp_sacks;
+format_function_t format_tcp_rcv_sacks;
+format_function_t format_tcp_connection;
+format_function_t format_tcp_connection_id;
always_inline void
tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
@@ -1287,101 +467,6 @@
ASSERT(_tc->state != TCP_STATE_ESTABLISHED \
|| transport_max_tx_dequeue (&_tc->connection) >= _a)
-void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack);
-u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose);
-
-/**
- * Register exiting cc algo type
- */
-void tcp_cc_algo_register (tcp_cc_algorithm_type_e type,
- const tcp_cc_algorithm_t * vft);
-
-/**
- * Register new cc algo type
- */
-tcp_cc_algorithm_type_e tcp_cc_algo_new_type (const tcp_cc_algorithm_t * vft);
-tcp_cc_algorithm_t *tcp_cc_algo_get (tcp_cc_algorithm_type_e type);
-
-static inline void *
-tcp_cc_data (tcp_connection_t * tc)
-{
- return (void *) tc->cc_data;
-}
-
-void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
- tcp_rate_sample_t * rs);
-/**
- * Initialize connection by gleaning network and rcv params from buffer
- *
- * @param tc connection to initialize
- * @param b buffer whose current data is pointing at ip
- * @param is_ip4 flag set to 1 if using ip4
- */
-void tcp_init_w_buffer (tcp_connection_t * tc, vlib_buffer_t * b, u8 is_ip4);
-
-/**
- * Push TCP header to buffer
- *
- * @param vm - vlib_main
- * @param b - buffer to write the header to
- * @param sp_net - source port net order
- * @param dp_net - destination port net order
- * @param seq - sequence number net order
- * @param ack - ack number net order
- * @param tcp_hdr_opts_len - header and options length in bytes
- * @param flags - header flags
- * @param wnd - window size
- *
- * @return - pointer to start of TCP header
- */
-always_inline void *
-vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq,
- u32 ack, u8 tcp_hdr_opts_len, u8 flags,
- u16 wnd)
-{
- tcp_header_t *th;
-
- th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len);
-
- th->src_port = sp;
- th->dst_port = dp;
- th->seq_number = seq;
- th->ack_number = ack;
- th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4;
- th->flags = flags;
- th->window = wnd;
- th->checksum = 0;
- th->urgent_pointer = 0;
- vnet_buffer (b)->l4_hdr_offset = (u8 *) th - b->data;
- b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID;
- return th;
-}
-
-/**
- * Push TCP header to buffer
- *
- * @param b - buffer to write the header to
- * @param sp_net - source port net order
- * @param dp_net - destination port net order
- * @param seq - sequence number host order
- * @param ack - ack number host order
- * @param tcp_hdr_opts_len - header and options length in bytes
- * @param flags - header flags
- * @param wnd - window size
- *
- * @return - pointer to start of TCP header
- */
-always_inline void *
-vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq,
- u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
-{
- return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net,
- clib_host_to_net_u32 (seq),
- clib_host_to_net_u32 (ack),
- tcp_hdr_opts_len, flags,
- clib_host_to_net_u16 (wnd));
-}
-
#endif /* _vnet_tcp_h_ */
/*
diff --git a/src/vnet/tcp/tcp_api.c b/src/vnet/tcp/tcp_api.c
index ac4314f..8b169f8 100644
--- a/src/vnet/tcp/tcp_api.c
+++ b/src/vnet/tcp/tcp_api.c
@@ -115,11 +115,6 @@
VLIB_API_INIT_FUNCTION (tcp_api_hookup);
-void
-tcp_api_reference (void)
-{
-}
-
/*
* fd.io coding-style-patch-verification: ON
*
diff --git a/src/vnet/tcp/tcp_bt.c b/src/vnet/tcp/tcp_bt.c
index e8dc5c9..6f9ee01 100644
--- a/src/vnet/tcp/tcp_bt.c
+++ b/src/vnet/tcp/tcp_bt.c
@@ -16,7 +16,9 @@
* draft-cheng-iccrg-delivery-rate-estimation-00
*/
+#include <vnet/tcp/tcp_bt.h>
#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
static tcp_bt_sample_t *
bt_get_sample (tcp_byte_tracker_t * bt, u32 bts_index)
diff --git a/src/vnet/tcp/tcp_bt.h b/src/vnet/tcp/tcp_bt.h
new file mode 100644
index 0000000..b9d0e57
--- /dev/null
+++ b/src/vnet/tcp/tcp_bt.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Rate estimation
+ */
+
+#ifndef SRC_VNET_TCP_TCP_BT_H_
+#define SRC_VNET_TCP_TCP_BT_H_
+
+#include <vnet/tcp/tcp_types.h>
+
+/**
+ * Byte tracker initialize
+ *
+ * @param tc connection for which the byte tracker should be allocated and
+ * initialized
+ */
+void tcp_bt_init (tcp_connection_t * tc);
+/**
+ * Byte tracker cleanup
+ *
+ * @param tc connection for which the byte tracker should be cleaned up
+ */
+void tcp_bt_cleanup (tcp_connection_t * tc);
+/**
+ * Flush byte tracker samples
+ *
+ * @param tc tcp connection for which samples should be flushed
+ */
+void tcp_bt_flush_samples (tcp_connection_t * tc);
+/**
+ * Track a tcp tx burst
+ *
+ * @param tc tcp connection
+ * @param len length of the tx burst (presumably in bytes - confirm against
+ * tcp_bt.c)
+ */
+void tcp_bt_track_tx (tcp_connection_t * tc, u32 len);
+/**
+ * Track a tcp retransmission
+ *
+ * @param tc tcp connection
+ * @param start start sequence number
+ * @param end end sequence number
+ */
+void tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end);
+/**
+ * Generate a delivery rate sample from recently acked bytes
+ *
+ * @param tc tcp connection
+ * @param rs resulting rate sample
+ */
+void tcp_bt_sample_delivery_rate (tcp_connection_t * tc,
+ tcp_rate_sample_t * rs);
+/**
+ * Check if sample to be generated is app limited
+ *
+ * @param tc tcp connection
+ */
+void tcp_bt_check_app_limited (tcp_connection_t * tc);
+/**
+ * Check if the byte tracker is in sane state
+ *
+ * Should be used only for testing
+ *
+ * @param bt byte tracker
+ */
+int tcp_bt_is_sane (tcp_byte_tracker_t * bt);
+
+format_function_t format_tcp_bt;
+
+#endif /* SRC_VNET_TCP_TCP_BT_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_cc.h b/src/vnet/tcp/tcp_cc.h
new file mode 100644
index 0000000..54d2dc6
--- /dev/null
+++ b/src/vnet/tcp/tcp_cc.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_VNET_TCP_TCP_CC_H_
+#define SRC_VNET_TCP_TCP_CC_H_
+
+#include <vnet/tcp/tcp_types.h>
+
+/**
+ * Hand a received ack to the connection's cc algorithm
+ *
+ * Calls the algorithm's rcv_ack callback and afterwards copies
+ * rcv_opts.tsecr into tsecr_last_ack.
+ *
+ * @param tc tcp connection
+ * @param rs rate sample passed unchanged to the algorithm
+ */
+always_inline void
+tcp_cc_rcv_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
+{
+ tc->cc_algo->rcv_ack (tc, rs);
+ tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+}
+
+/**
+ * Forward an ack to the cc algorithm's rcv_cong_ack callback
+ *
+ * @param tc tcp connection
+ * @param ack_type ack type passed unchanged to the algorithm
+ * @param rs rate sample passed unchanged to the algorithm
+ */
+static inline void
+tcp_cc_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
+ tcp_rate_sample_t * rs)
+{
+ tc->cc_algo->rcv_cong_ack (tc, ack_type, rs);
+}
+
+/**
+ * Invoke the cc algorithm's congestion callback for the connection
+ *
+ * @param tc tcp connection
+ */
+static inline void
+tcp_cc_congestion (tcp_connection_t * tc)
+{
+ tc->cc_algo->congestion (tc);
+}
+
+/**
+ * Invoke the cc algorithm's loss callback for the connection
+ *
+ * @param tc tcp connection
+ */
+static inline void
+tcp_cc_loss (tcp_connection_t * tc)
+{
+ tc->cc_algo->loss (tc);
+}
+
+/**
+ * Invoke the cc algorithm's recovered callback for the connection
+ *
+ * @param tc tcp connection
+ */
+static inline void
+tcp_cc_recovered (tcp_connection_t * tc)
+{
+ tc->cc_algo->recovered (tc);
+}
+
+/**
+ * Invoke the cc algorithm's undo_recovery callback, if it provides one
+ *
+ * The callback is optional: nothing happens when the algorithm leaves
+ * undo_recovery unset.
+ *
+ * @param tc tcp connection
+ */
+static inline void
+tcp_cc_undo_recovery (tcp_connection_t * tc)
+{
+ if (tc->cc_algo->undo_recovery)
+ tc->cc_algo->undo_recovery (tc);
+}
+
+/**
+ * Deliver an event to the cc algorithm, if it provides an event callback
+ *
+ * The callback is optional: nothing happens when the algorithm leaves
+ * event unset.
+ *
+ * @param tc tcp connection
+ * @param evt event passed unchanged to the algorithm
+ */
+static inline void
+tcp_cc_event (tcp_connection_t * tc, tcp_cc_event_t evt)
+{
+ if (tc->cc_algo->event)
+ tc->cc_algo->event (tc, evt);
+}
+
+/**
+ * Compute the pacing rate for a connection
+ *
+ * If the cc algorithm provides get_pacing_rate, its result is returned.
+ * Otherwise the rate is cwnd divided by the smaller of srtt * TCP_TICK and
+ * mrtt_us.
+ *
+ * NOTE(review): the divisor is not checked against 0 - confirm callers
+ * only ask for a rate once an rtt estimate is available.
+ *
+ * @param tc tcp connection
+ * @return pacing rate, truncated to u64
+ */
+static inline u64
+tcp_cc_get_pacing_rate (tcp_connection_t * tc)
+{
+ if (tc->cc_algo->get_pacing_rate)
+ return tc->cc_algo->get_pacing_rate (tc);
+
+ /* Use whichever of the two rtt values is smaller */
+ f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
+
+ /* TODO should constrain to interface's max throughput but
+ * we don't have link speeds for sw ifs ..*/
+ return ((f64) tc->cwnd / srtt);
+}
+
+/**
+ * Return the connection's cc_data as an untyped pointer
+ *
+ * @param tc tcp connection
+ * @return tc->cc_data cast to void *
+ */
+static inline void *
+tcp_cc_data (tcp_connection_t * tc)
+{
+ return (void *) tc->cc_data;
+}
+
+/**
+ * Register existing cc algo type
+ */
+void tcp_cc_algo_register (tcp_cc_algorithm_type_e type,
+ const tcp_cc_algorithm_t * vft);
+
+/**
+ * Register new cc algo type
+ */
+tcp_cc_algorithm_type_e tcp_cc_algo_new_type (const tcp_cc_algorithm_t * vft);
+tcp_cc_algorithm_t *tcp_cc_algo_get (tcp_cc_algorithm_type_e type);
+
+
+void newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type,
+ tcp_rate_sample_t * rs);
+
+
+#endif /* SRC_VNET_TCP_TCP_CC_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_cli.c b/src/vnet/tcp/tcp_cli.c
new file mode 100644
index 0000000..a28e2c8
--- /dev/null
+++ b/src/vnet/tcp/tcp_cli.c
@@ -0,0 +1,1030 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
+#include <vnet/dpo/receive_dpo.h>
+#include <vnet/ip-neighbor/ip_neighbor.h>
+
+/* Printable names of the tcp fsm states, indexed by state value */
+const char *tcp_fsm_states[] = {
+#define _(sym, str) str,
+  foreach_tcp_fsm_state
+#undef _
+};
+
+/**
+ * Format a tcp state as its printable name
+ *
+ * Expects one u32 vararg: the state. A value that is not below
+ * TCP_N_STATES is printed as "UNKNOWN" together with the raw value.
+ */
+u8 *
+format_tcp_state (u8 * s, va_list * args)
+{
+  u32 tcp_state = va_arg (*args, u32);
+
+  if (tcp_state >= TCP_N_STATES)
+    return format (s, "UNKNOWN (%d (0x%x))", tcp_state, tcp_state);
+
+  return format (s, "%s", tcp_fsm_states[tcp_state]);
+}
+
+/* Printable names of the tcp cfg flags, indexed by flag bit position */
+const char *tcp_cfg_flags_str[] = {
+#define _(sym, str) str,
+  foreach_tcp_cfg_flag
+#undef _
+};
+
+/**
+ * Format the cfg flags set on a connection as a ", " separated list
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Nothing is appended
+ * when no flag is set.
+ */
+static u8 *
+format_tcp_cfg_flags (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  int bit, highest = -1;
+
+  /* Locate the highest set bit; it is the one printed w/o separator */
+  for (bit = 0; bit < TCP_CFG_N_FLAG_BITS; bit++)
+    {
+      if (tc->cfg_flags & (1 << bit))
+        highest = bit;
+    }
+
+  if (highest < 0)
+    return s;
+
+  for (bit = 0; bit < highest; bit++)
+    {
+      if (!(tc->cfg_flags & (1 << bit)))
+        continue;
+      s = format (s, "%s, ", tcp_cfg_flags_str[bit]);
+    }
+
+  return format (s, "%s", tcp_cfg_flags_str[highest]);
+}
+
+/* Printable names of the connection flags, indexed by flag bit position */
+const char *tcp_connection_flags_str[] = {
+#define _(sym, str) str,
+  foreach_tcp_connection_flag
+#undef _
+};
+
+/**
+ * Format the flags set on a connection as a ", " separated list
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Nothing is appended
+ * when no flag is set.
+ */
+static u8 *
+format_tcp_connection_flags (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  int bit, highest = -1;
+
+  /* Locate the highest set bit; it is the one printed w/o separator */
+  for (bit = 0; bit < TCP_CONN_N_FLAG_BITS; bit++)
+    {
+      if (tc->flags & (1 << bit))
+        highest = bit;
+    }
+
+  if (highest < 0)
+    return s;
+
+  for (bit = 0; bit < highest; bit++)
+    {
+      if (!(tc->flags & (1 << bit)))
+        continue;
+      s = format (s, "%s, ", tcp_connection_flags_str[bit]);
+    }
+
+  return format (s, "%s", tcp_connection_flags_str[highest]);
+}
+
+/* Printable names of the connection timers, indexed by timer id */
+const char *tcp_conn_timers[] = {
+#define _(sym, str) str,
+  foreach_tcp_timer
+#undef _
+};
+
+/**
+ * Format the timers of a connection whose handle is valid
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Names of all timers
+ * whose handle differs from TCP_TIMER_HANDLE_INVALID are printed as a
+ * "," separated list; nothing is appended if there is none.
+ */
+static u8 *
+format_tcp_timers (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  int tid, last_valid = -1;
+
+  for (tid = 0; tid < TCP_N_TIMERS; tid++)
+    {
+      if (tc->timers[tid] != TCP_TIMER_HANDLE_INVALID)
+        last_valid = tid;
+    }
+
+  if (last_valid < 0)
+    return s;
+
+  for (tid = 0; tid < last_valid; tid++)
+    {
+      if (tc->timers[tid] == TCP_TIMER_HANDLE_INVALID)
+        continue;
+      s = format (s, "%s,", tcp_conn_timers[tid]);
+    }
+
+  /* Last entry goes out without a trailing separator */
+  return format (s, "%s", tcp_conn_timers[last_valid]);
+}
+
+/**
+ * Format the congestion status of a connection
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Prints "recovery",
+ * "fastrecovery" or "none"; recovery is tested first.
+ */
+static u8 *
+format_tcp_congestion_status (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+
+  if (tcp_in_recovery (tc))
+    return format (s, "recovery");
+
+  if (tcp_in_fastrecovery (tc))
+    return format (s, "fastrecovery");
+
+  return format (s, "none");
+}
+
+/**
+ * Receive window left for the connection
+ *
+ * @param tc tcp connection
+ * @return rcv_wnd minus the distance between rcv_nxt and rcv_las
+ */
+static i32
+tcp_rcv_wnd_available (tcp_connection_t * tc)
+{
+ return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
+}
+
+/**
+ * Format the congestion control variables of a connection
+ *
+ * Expects one vararg: the tcp_connection_t pointer. snd_congestion,
+ * limited_transmit, rxt_head and prr_start are printed as offsets from
+ * tc->iss. Continuation lines are indented to the indent of s on entry.
+ */
+static u8 *
+format_tcp_congestion (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ u32 indent = format_get_indent (s), prr_space = 0;
+
+ s = format (s, "%U ", format_tcp_congestion_status, tc);
+ s = format (s, "algo %s cwnd %u ssthresh %u bytes_acked %u\n",
+ tc->cc_algo->name, tc->cwnd, tc->ssthresh, tc->bytes_acked);
+ s = format (s, "%Ucc space %u prev_cwnd %u prev_ssthresh %u\n",
+ format_white_space, indent, tcp_available_cc_snd_space (tc),
+ tc->prev_cwnd, tc->prev_ssthresh);
+ s = format (s, "%Usnd_cong %u dupack %u limited_tx %u\n",
+ format_white_space, indent, tc->snd_congestion - tc->iss,
+ tc->rcv_dupacks, tc->limited_transmit - tc->iss);
+ /* rxt_ts is printed as the time elapsed since snd_rxt_ts */
+ s = format (s, "%Urxt_bytes %u rxt_delivered %u rxt_head %u rxt_ts %u\n",
+ format_white_space, indent, tc->snd_rxt_bytes,
+ tc->rxt_delivered, tc->rxt_head - tc->iss,
+ tcp_time_now_w_thread (tc->c_thread_index) - tc->snd_rxt_ts);
+ /* prr space is only computed in fast recovery, otherwise reported as 0 */
+ if (tcp_in_fastrecovery (tc))
+ prr_space = tcp_fastrecovery_prr_snd_space (tc);
+ s = format (s, "%Uprr_start %u prr_delivered %u prr space %u\n",
+ format_white_space, indent, tc->prr_start - tc->iss,
+ tc->prr_delivered, prr_space);
+ return s;
+}
+
+/**
+ * Format the per-connection counters
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Prints in/out segment
+ * and byte counters, retransmit counters, the time elapsed since start_ts
+ * and the window error counters. Continuation lines are indented to the
+ * indent of s on entry.
+ */
+static u8 *
+format_tcp_stats (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ u32 indent = format_get_indent (s);
+ s = format (s, "in segs %lu dsegs %lu bytes %lu dupacks %u\n",
+ tc->segs_in, tc->data_segs_in, tc->bytes_in, tc->dupacks_in);
+ s = format (s, "%Uout segs %lu dsegs %lu bytes %lu dupacks %u\n",
+ format_white_space, indent, tc->segs_out,
+ tc->data_segs_out, tc->bytes_out, tc->dupacks_out);
+ /* duration is the time elapsed since the connection's start_ts */
+ s = format (s, "%Ufr %u tr %u rxt segs %lu bytes %lu duration %.3f\n",
+ format_white_space, indent, tc->fr_occurences,
+ tc->tr_occurences, tc->segs_retrans, tc->bytes_retrans,
+ tcp_time_now_us (tc->c_thread_index) - tc->start_ts);
+ s = format (s, "%Uerr wnd data below %u above %u ack below %u above %u",
+ format_white_space, indent, tc->errors.below_data_wnd,
+ tc->errors.above_data_wnd, tc->errors.below_ack_wnd,
+ tc->errors.above_ack_wnd);
+ return s;
+}
+
+/**
+ * Format the detailed state of a connection
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Send-side sequence
+ * numbers are printed as offsets from tc->iss and receive-side ones as
+ * offsets from tc->irs. Scoreboard and stats are appended only once the
+ * state is at or past TCP_STATE_ESTABLISHED, and the tx sacks only when
+ * snd_sacks is not empty.
+ */
+static u8 *
+format_tcp_vars (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ s = format (s, " index: %u cfg: %U flags: %U timers: %U\n", tc->c_c_index,
+ format_tcp_cfg_flags, tc, format_tcp_connection_flags, tc,
+ format_tcp_timers, tc);
+ s = format (s, " snd_una %u snd_nxt %u snd_una_max %u",
+ tc->snd_una - tc->iss, tc->snd_nxt - tc->iss,
+ tc->snd_una_max - tc->iss);
+ s = format (s, " rcv_nxt %u rcv_las %u\n",
+ tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs);
+ s = format (s, " snd_wnd %u rcv_wnd %u rcv_wscale %u ",
+ tc->snd_wnd, tc->rcv_wnd, tc->rcv_wscale);
+ s = format (s, "snd_wl1 %u snd_wl2 %u\n", tc->snd_wl1 - tc->irs,
+ tc->snd_wl2 - tc->iss);
+ s = format (s, " flight size %u out space %u rcv_wnd_av %u",
+ tcp_flight_size (tc), tcp_available_output_snd_space (tc),
+ tcp_rcv_wnd_available (tc));
+ s = format (s, " tsval_recent %u\n", tc->tsval_recent);
+ /* tsval_recent_age is printed as the time elapsed since it was set */
+ s = format (s, " tsecr %u tsecr_last_ack %u tsval_recent_age %u",
+ tc->rcv_opts.tsecr, tc->tsecr_last_ack,
+ tcp_time_now () - tc->tsval_recent_age);
+ s = format (s, " snd_mss %u\n", tc->snd_mss);
+ s = format (s, " rto %u rto_boff %u srtt %u us %.3f rttvar %u rtt_ts %.4f",
+ tc->rto, tc->rto_boff, tc->srtt, tc->mrtt_us * 1000, tc->rttvar,
+ tc->rtt_ts);
+ s = format (s, " rtt_seq %u\n", tc->rtt_seq - tc->iss);
+ s = format (s, " next_node %u opaque 0x%x fib_index %u\n",
+ tc->next_node_index, tc->next_node_opaque, tc->c_fib_index);
+ s = format (s, " cong: %U", format_tcp_congestion, tc);
+
+ if (tc->state >= TCP_STATE_ESTABLISHED)
+ {
+ s = format (s, " sboard: %U\n", format_tcp_scoreboard, &tc->sack_sb,
+ tc);
+ s = format (s, " stats: %U\n", format_tcp_stats, tc);
+ }
+ if (vec_len (tc->snd_sacks))
+ s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
+
+ return s;
+}
+
+/**
+ * Format the identifier of a connection
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Output has the form
+ * "[thread:session][T] lcl_ip:lcl_port->rmt_ip:rmt_port", with ports
+ * converted from network to host order. A NULL connection leaves s
+ * untouched.
+ */
+u8 *
+format_tcp_connection_id (u8 * s, va_list * args)
+{
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ if (!tc)
+ return s;
+ /* Both branches print the same layout, only the address family differs */
+ if (tc->c_is_ip4)
+ {
+ s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index,
+ tc->c_s_index, "T", format_ip4_address, &tc->c_lcl_ip4,
+ clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address,
+ &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port));
+ }
+ else
+ {
+ s = format (s, "[%d:%d][%s] %U:%d->%U:%d", tc->c_thread_index,
+ tc->c_s_index, "T", format_ip6_address, &tc->c_lcl_ip6,
+ clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address,
+ &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port));
+ }
+
+ return s;
+}
+
+/**
+ * Format a connection with a caller selected level of detail
+ *
+ * Expects two varargs: the tcp_connection_t pointer and a u32 verbosity.
+ * The connection id is always printed; verbosity > 0 adds the state and
+ * verbosity > 1 additionally adds the full variable dump. A NULL
+ * connection leaves s untouched.
+ */
+u8 *
+format_tcp_connection (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  u32 detail = va_arg (*args, u32);
+
+  if (!tc)
+    return s;
+
+  s = format (s, "%-50U", format_tcp_connection_id, tc);
+  if (detail == 0)
+    return s;
+
+  s = format (s, "%-15U", format_tcp_state, tc->state);
+  if (detail == 1)
+    return s;
+
+  return format (s, "\n%U", format_tcp_vars, tc);
+}
+
+/**
+ * Format the sack blocks a connection has queued for sending
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Each block of
+ * tc->snd_sacks is printed on its own line with start and end as offsets
+ * from tc->irs; the last line has no trailing newline.
+ */
+u8 *
+format_tcp_sacks (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  sack_block_t *blocks = tc->snd_sacks, *blk;
+  int n_blocks = vec_len (blocks), idx;
+
+  if (!n_blocks)
+    return s;
+
+  for (idx = 0; idx < n_blocks - 1; idx++)
+    {
+      blk = &blocks[idx];
+      s = format (s, " start %u end %u\n", blk->start - tc->irs,
+                  blk->end - tc->irs);
+    }
+
+  blk = &blocks[n_blocks - 1];
+  return format (s, " start %u end %u", blk->start - tc->irs,
+                 blk->end - tc->irs);
+}
+
+/**
+ * Format the sack blocks received from the peer
+ *
+ * Expects one vararg: the tcp_connection_t pointer. Each block of
+ * tc->rcv_opts.sacks is printed on its own line with start and end as
+ * offsets from tc->iss; the last line has no trailing newline.
+ */
+u8 *
+format_tcp_rcv_sacks (u8 * s, va_list * args)
+{
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  sack_block_t *blocks = tc->rcv_opts.sacks, *blk;
+  int n_blocks = vec_len (blocks), idx;
+
+  if (!n_blocks)
+    return s;
+
+  for (idx = 0; idx < n_blocks - 1; idx++)
+    {
+      blk = &blocks[idx];
+      s = format (s, " start %u end %u\n", blk->start - tc->iss,
+                  blk->end - tc->iss);
+    }
+
+  blk = &blocks[n_blocks - 1];
+  return format (s, " start %u end %u", blk->start - tc->iss,
+                 blk->end - tc->iss);
+}
+
+/**
+ * Format one scoreboard hole as " [start, end]"
+ *
+ * Expects two varargs: the sack_scoreboard_hole_t pointer and the
+ * tcp_connection_t pointer. With a connection the bounds are printed as
+ * offsets from tc->iss, with a NULL connection they are printed as is.
+ */
+static u8 *
+format_tcp_sack_hole (u8 * s, va_list * args)
+{
+  sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *);
+  tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+  /* No connection means no base to subtract */
+  u32 base = tc ? tc->iss : 0;
+
+  return format (s, " [%u, %u]", hole->start - base, hole->end - base);
+}
+
+/**
+ * Format a sack scoreboard
+ *
+ * Expects two varargs: the sack_scoreboard_t pointer and the
+ * tcp_connection_t pointer it belongs to. high_sacked, high_rxt and
+ * rescue_rxt are printed as offsets from tc->iss. If the scoreboard has
+ * holes, head, tail and the number of holes are printed followed by the
+ * holes themselves.
+ *
+ * NOTE(review): tc is dereferenced unconditionally here, so callers must
+ * always pass a valid connection as second vararg.
+ */
+u8 *
+format_tcp_scoreboard (u8 * s, va_list * args)
+{
+ sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *);
+ tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
+ sack_scoreboard_hole_t *hole;
+ u32 indent = format_get_indent (s);
+
+ s = format (s, "sacked %u last_sacked %u lost %u last_lost %u"
+ " rxt_sacked %u\n",
+ sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes,
+ sb->last_lost_bytes, sb->rxt_sacked);
+ s = format (s, "%Ulast_delivered %u high_sacked %u is_reneging %u\n",
+ format_white_space, indent, sb->last_bytes_delivered,
+ sb->high_sacked - tc->iss, sb->is_reneging);
+ s = format (s, "%Ucur_rxt_hole %u high_rxt %u rescue_rxt %u",
+ format_white_space, indent, sb->cur_rxt_hole,
+ sb->high_rxt - tc->iss, sb->rescue_rxt - tc->iss);
+
+ /* The hole summary line is only emitted if there is at least one hole */
+ hole = scoreboard_first_hole (sb);
+ if (hole)
+ s = format (s, "\n%Uhead %u tail %u %u holes:\n%U", format_white_space,
+ indent, sb->head, sb->tail, pool_elts (sb->holes),
+ format_white_space, indent);
+
+ while (hole)
+ {
+ s = format (s, "%U", format_tcp_sack_hole, hole, tc);
+ hole = scoreboard_next_hole (sb, hole);
+ }
+
+ return s;
+}
+
+/**
+ * \brief Configure an ipv4 source address range
+ *
+ * Enables proxy arp for the range on the interface that resolves the last
+ * address, appends every address of the range to tcp_cfg.ip4_src_addrs and
+ * installs an exclusive receive dpo fib entry for each of them.
+ *
+ * @param vm vlib_main_t pointer (not used by this function)
+ * @param start first ipv4 address in the source address range. It is
+ * advanced in place while the range is walked, so the caller's
+ * copy is modified.
+ * @param end last ipv4 address in the source address range
+ * @param table_id VRF / table ID, 0 for the default FIB
+ * @return 0 if all OK, else an error indication from api_errno.h
+ */
+
+int
+tcp_configure_v4_source_address_range (vlib_main_t * vm,
+ ip4_address_t * start,
+ ip4_address_t * end, u32 table_id)
+{
+ u32 start_host_byte_order, end_host_byte_order;
+ fib_prefix_t prefix;
+ fib_node_index_t fei;
+ u32 fib_index = 0;
+ u32 sw_if_index;
+ int rv;
+
+ clib_memset (&prefix, 0, sizeof (prefix));
+
+ fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id);
+
+ if (fib_index == ~0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ start_host_byte_order = clib_net_to_host_u32 (start->as_u32);
+ end_host_byte_order = clib_net_to_host_u32 (end->as_u32);
+
+ /* sanity check for reversed args or some such. The subtraction is
+ * unsigned, so end < start wraps to a large value and is rejected too */
+ if ((end_host_byte_order - start_host_byte_order) > (10 << 10))
+ return VNET_API_ERROR_INVALID_ARGUMENT;
+
+ /* Lookup the last address, to identify the interface involved */
+ prefix.fp_len = 32;
+ prefix.fp_proto = FIB_PROTOCOL_IP4;
+ memcpy (&prefix.fp_addr.ip4, end, sizeof (ip4_address_t));
+
+ fei = fib_table_lookup (fib_index, &prefix);
+
+ /* Couldn't find route to destination. Bail out. */
+ if (fei == FIB_NODE_INDEX_INVALID)
+ return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
+
+ sw_if_index = fib_entry_get_resolving_interface (fei);
+
+ /* Configure proxy arp across the range */
+ rv = ip4_neighbor_proxy_add (fib_index, start, end);
+
+ if (rv)
+ return rv;
+
+ rv = ip4_neighbor_proxy_enable (sw_if_index);
+
+ if (rv)
+ return rv;
+
+ /* Walk the range one address at a time, start included */
+ do
+ {
+ dpo_id_t dpo = DPO_INVALID;
+
+ vec_add1 (tcp_cfg.ip4_src_addrs, start[0]);
+
+ /* Add local adjacencies for the range */
+
+ receive_dpo_add_or_lock (DPO_PROTO_IP4, ~0 /* sw_if_index */ ,
+ NULL, &dpo);
+ prefix.fp_len = 32;
+ prefix.fp_proto = FIB_PROTOCOL_IP4;
+ prefix.fp_addr.ip4.as_u32 = start->as_u32;
+
+ fib_table_entry_special_dpo_update (fib_index,
+ &prefix,
+ FIB_SOURCE_API,
+ FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+ dpo_reset (&dpo);
+
+ /* Advance in host order and write the result back in network order */
+ start_host_byte_order++;
+ start->as_u32 = clib_host_to_net_u32 (start_host_byte_order);
+ }
+ while (start_host_byte_order <= end_host_byte_order);
+
+ return 0;
+}
+
+/**
+ * \brief Configure an ipv6 source address range
+ *
+ * For every address from start to end: appends it to
+ * tcp_cfg.ip6_src_addrs, adds a proxy neighbor discovery entry on the
+ * interface that resolves it and installs an exclusive receive dpo fib
+ * entry.
+ *
+ * An address is appended to tcp_cfg.ip6_src_addrs before its fib lookup,
+ * so it stays in the list when the function bails out with an error for
+ * that address.
+ *
+ * @param vm vlib_main_t pointer (not used by this function)
+ * @param start first ipv6 address in the source address range. It is
+ * advanced in place while the range is walked, so the caller's
+ * copy is modified.
+ * @param end last ipv6 address in the source address range
+ * @param table_id VRF / table ID, 0 for the default FIB
+ * @return 0 if all OK, else an error indication from api_errno.h
+ */
+
+int
+tcp_configure_v6_source_address_range (vlib_main_t * vm,
+ ip6_address_t * start,
+ ip6_address_t * end, u32 table_id)
+{
+ fib_prefix_t prefix;
+ u32 fib_index = 0;
+ fib_node_index_t fei;
+ u32 sw_if_index;
+
+ clib_memset (&prefix, 0, sizeof (prefix));
+
+ fib_index = fib_table_find (FIB_PROTOCOL_IP6, table_id);
+
+ if (fib_index == ~0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ /* The loop only ends when start equals end or a lookup fails */
+ while (1)
+ {
+ int i;
+ ip6_address_t tmp;
+ dpo_id_t dpo = DPO_INVALID;
+
+ /* Remember this address */
+ vec_add1 (tcp_cfg.ip6_src_addrs, start[0]);
+
+ /* Lookup the prefix, to identify the interface involved */
+ prefix.fp_len = 128;
+ prefix.fp_proto = FIB_PROTOCOL_IP6;
+ memcpy (&prefix.fp_addr.ip6, start, sizeof (ip6_address_t));
+
+ fei = fib_table_lookup (fib_index, &prefix);
+
+ /* Couldn't find route to destination. Bail out. */
+ if (fei == FIB_NODE_INDEX_INVALID)
+ return VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
+
+ sw_if_index = fib_entry_get_resolving_interface (fei);
+
+ if (sw_if_index == (u32) ~ 0)
+ return VNET_API_ERROR_NO_MATCHING_INTERFACE;
+
+ /* Add a proxy neighbor discovery entry for this address */
+ ip6_neighbor_proxy_add (sw_if_index, start);
+
+ /* Add a receive adjacency for this address */
+ receive_dpo_add_or_lock (DPO_PROTO_IP6, ~0 /* sw_if_index */ ,
+ NULL, &dpo);
+
+ fib_table_entry_special_dpo_update (fib_index,
+ &prefix,
+ FIB_SOURCE_API,
+ FIB_ENTRY_FLAG_EXCLUSIVE, &dpo);
+ dpo_reset (&dpo);
+
+ /* Done with the entire range? */
+ if (!memcmp (start, end, sizeof (start[0])))
+ break;
+
+ /* Increment the address. DGMS.
+ * Byte-wise add of 1 starting at the last byte; a byte that wraps to 0
+ * carries into the byte before it. */
+ tmp = start[0];
+ for (i = 15; i >= 0; i--)
+ {
+ tmp.as_u8[i] += 1;
+ if (tmp.as_u8[i] != 0)
+ break;
+ }
+ start[0] = tmp;
+ }
+ return 0;
+}
+
+/**
+ * CLI handler for "tcp src-address"
+ *
+ * Accepts an ip4 and/or ip6 address or "first - last" range plus an
+ * optional "fib-table <id>", then configures the parsed ranges through
+ * tcp_configure_v4_source_address_range and
+ * tcp_configure_v6_source_address_range.
+ *
+ * @return 0 on success, otherwise a clib error describing the failure
+ */
+static clib_error_t *
+tcp_src_address_fn (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd_arg)
+{
+ ip4_address_t v4start, v4end;
+ ip6_address_t v6start, v6end;
+ u32 table_id = 0;
+ int v4set = 0;
+ int v6set = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Ranges are tried before single addresses of the same family */
+ if (unformat (input, "%U - %U", unformat_ip4_address, &v4start,
+ unformat_ip4_address, &v4end))
+ v4set = 1;
+ else if (unformat (input, "%U", unformat_ip4_address, &v4start))
+ {
+ /* A single address is a range whose end equals its start */
+ memcpy (&v4end, &v4start, sizeof (v4start));
+ v4set = 1;
+ }
+ else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start,
+ unformat_ip6_address, &v6end))
+ v6set = 1;
+ else if (unformat (input, "%U", unformat_ip6_address, &v6start))
+ {
+ memcpy (&v6end, &v6start, sizeof (v6start));
+ v6set = 1;
+ }
+ else if (unformat (input, "fib-table %d", &table_id))
+ ;
+ else
+ /* Unrecognized input stops parsing without raising an error */
+ break;
+ }
+
+ if (!v4set && !v6set)
+ return clib_error_return (0, "at least one v4 or v6 address required");
+
+ if (v4set)
+ {
+ rv = tcp_configure_v4_source_address_range (vm, &v4start, &v4end,
+ table_id);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "Invalid table-id %d", table_id);
+
+ case VNET_API_ERROR_INVALID_ARGUMENT:
+ return clib_error_return (0, "Invalid address range %U - %U",
+ format_ip4_address, &v4start,
+ format_ip4_address, &v4end);
+ default:
+ return clib_error_return (0, "error %d", rv);
+ break;
+ }
+ }
+ if (v6set)
+ {
+ rv = tcp_configure_v6_source_address_range (vm, &v6start, &v6end,
+ table_id);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "Invalid table-id %d", table_id);
+
+ default:
+ return clib_error_return (0, "error %d", rv);
+ break;
+ }
+ }
+ return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI: "tcp src-address", handled by tcp_src_address_fn. The help text
+ * lists the optional fib-table argument the handler parses. */
+VLIB_CLI_COMMAND (tcp_src_address_command, static) =
+{
+  .path = "tcp src-address",
+  .short_help = "tcp src-address <ip-addr> [- <ip-addr>] [fib-table <id>] add src address range",
+  .function = tcp_src_address_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * Append the recorded scoreboard trace of sb to s
+ *
+ * Trace elements are printed as "{start, end, ack, snd_una_max, group}"
+ * tuples, three per line. If the code is built without
+ * TCP_SCOREBOARD_TRACE, or no trace was recorded, s is returned
+ * unchanged.
+ *
+ * @param s vector to append to
+ * @param sb scoreboard whose trace is dumped
+ * @return the resulting vector
+ */
+static u8 *
+tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb)
+{
+#if TCP_SCOREBOARD_TRACE
+
+  scoreboard_trace_elt_t *block;
+  int i = 0;
+
+  if (!sb->trace)
+    return s;
+
+  s = format (s, "scoreboard trace:");
+  vec_foreach (block, sb->trace)
+  {
+    s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end,
+                block->ack, block->snd_una_max, block->group);
+    if ((++i % 3) == 0)
+      s = format (s, "\n");
+  }
+  return s;
+#else
+  /* Hand the caller's vector back instead of dropping it */
+  return s;
+#endif
+}
+
+/**
+ * CLI handler for "show tcp scoreboard trace <connection>"
+ *
+ * Dumps the scoreboard trace recorded for the given connection. Reports
+ * that tracing is not enabled when built without TCP_SCOREBOARD_TRACE and
+ * that the connection was not found when none could be parsed.
+ *
+ * @return 0, or a clib error on unparsable input
+ */
+static clib_error_t *
+tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
+                              vlib_cli_command_t * cmd_arg)
+{
+  transport_connection_t *tconn = 0;
+  tcp_connection_t *tc;
+  u8 *s = 0;
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "%U", unformat_transport_connection, &tconn,
+                    TRANSPORT_PROTO_TCP))
+        ;
+      else
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+    }
+
+  if (!TCP_SCOREBOARD_TRACE)
+    {
+      vlib_cli_output (vm, "scoreboard tracing not enabled");
+      return 0;
+    }
+
+  tc = tcp_get_connection_from_transport (tconn);
+  /* tconn stays 0 if no connection was given; don't touch its scoreboard */
+  if (!tc)
+    {
+      vlib_cli_output (vm, "connection not found");
+      return 0;
+    }
+  s = tcp_scoreboard_dump_trace (s, &tc->sack_sb);
+  vlib_cli_output (vm, "%v", s);
+  /* The vector was only needed for printing */
+  vec_free (s);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI: "show tcp scoreboard trace", handled by tcp_show_scoreboard_trace_fn */
+VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) =
+{
+ .path = "show tcp scoreboard trace",
+ .short_help = "show tcp scoreboard trace <connection>",
+ .function = tcp_show_scoreboard_trace_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * Replay the scoreboard trace of a connection on a scratch connection
+ *
+ * Trace elements are processed group by group: elements with a non-zero
+ * ack update the ack to replay, all others are collected as sack blocks.
+ * After each group the blocks are pushed through tcp_rcv_sacks on a
+ * zero-initialized scratch connection, and the resulting scoreboard is
+ * appended to s.
+ *
+ * @param s vector to append to
+ * @param tc connection whose trace is replayed; NULL returns s unchanged
+ * @param verbose if set, also print the acks, segments and the scoreboard
+ *                after every group
+ * @return the resulting vector
+ */
+u8 *
+tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose)
+{
+  int i, trace_len = 0;
+  scoreboard_trace_elt_t *trace = 0;
+  u32 next_ack, left, group, has_new_ack = 0;
+  tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc;
+  sack_block_t *block;
+
+  if (!TCP_SCOREBOARD_TRACE)
+    {
+      s = format (s, "scoreboard tracing not enabled");
+      return s;
+    }
+
+  if (!tc)
+    return s;
+
+  clib_memset (dummy_tc, 0, sizeof (*dummy_tc));
+  tcp_connection_timers_init (dummy_tc);
+  scoreboard_init (&dummy_tc->sack_sb);
+  dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;
+
+#if TCP_SCOREBOARD_TRACE
+  trace = tc->sack_sb.trace;
+  trace_len = vec_len (tc->sack_sb.trace);
+#endif
+
+  /* Seed snd_una/snd_una_max from the acks found in the trace */
+  for (i = 0; i < trace_len; i++)
+    {
+      if (trace[i].ack != 0)
+        {
+          dummy_tc->snd_una = trace[i].ack - 1448;
+          dummy_tc->snd_una_max = trace[i].ack;
+        }
+    }
+
+  /* A group without an ack element replays against the current snd_una
+   * instead of an uninitialized value */
+  next_ack = dummy_tc->snd_una;
+
+  left = 0;
+  while (left < trace_len)
+    {
+      group = trace[left].group;
+      vec_reset_length (dummy_tc->rcv_opts.sacks);
+      has_new_ack = 0;
+      /* Bound the scan: the last group ends with the trace itself */
+      while (left < trace_len && trace[left].group == group)
+        {
+          if (trace[left].ack != 0)
+            {
+              if (verbose)
+                s = format (s, "Adding ack %u, snd_una_max %u, segs: ",
+                            trace[left].ack, trace[left].snd_una_max);
+              dummy_tc->snd_una_max = trace[left].snd_una_max;
+              next_ack = trace[left].ack;
+              has_new_ack = 1;
+            }
+          else
+            {
+              if (verbose)
+                s = format (s, "[%u, %u], ", trace[left].start,
+                            trace[left].end);
+              vec_add2 (dummy_tc->rcv_opts.sacks, block, 1);
+              block->start = trace[left].start;
+              block->end = trace[left].end;
+            }
+          left++;
+        }
+
+      /* Push segments */
+      tcp_rcv_sacks (dummy_tc, next_ack);
+      if (has_new_ack)
+        dummy_tc->snd_una = next_ack;
+
+      /* format_tcp_scoreboard reads the connection as second vararg */
+      if (verbose)
+        s = format (s, "result: %U", format_tcp_scoreboard,
+                    &dummy_tc->sack_sb, dummy_tc);
+
+    }
+  s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb,
+              dummy_tc);
+
+  /* Release the scratch sack block vector.
+   * NOTE(review): the scratch scoreboard itself is not torn down here -
+   * confirm whether scoreboard_init allocates anything that needs it. */
+  vec_free (dummy_tc->rcv_opts.sacks);
+
+  return s;
+}
+
+/**
+ * CLI handler for "tcp replay scoreboard <connection>"
+ *
+ * Replays the scoreboard trace of the given connection verbosely and
+ * prints the result. Reports that tracing is not enabled when built
+ * without TCP_SCOREBOARD_TRACE and that the connection was not found when
+ * none could be parsed.
+ *
+ * @return 0, or a clib error on unparsable input
+ */
+static clib_error_t *
+tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
+                         vlib_cli_command_t * cmd_arg)
+{
+  transport_connection_t *tconn = 0;
+  tcp_connection_t *tc = 0;
+  u8 *str = 0;
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "%U", unformat_transport_connection, &tconn,
+                    TRANSPORT_PROTO_TCP))
+        ;
+      else
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+    }
+
+  if (!TCP_SCOREBOARD_TRACE)
+    {
+      vlib_cli_output (vm, "scoreboard tracing not enabled");
+      return 0;
+    }
+
+  tc = tcp_get_connection_from_transport (tconn);
+  if (!tc)
+    {
+      vlib_cli_output (vm, "connection not found");
+      return 0;
+    }
+  str = tcp_scoreboard_replay (str, tc, 1);
+  vlib_cli_output (vm, "%v", str);
+  /* The vector was only needed for printing */
+  vec_free (str);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI: "tcp replay scoreboard", handled by tcp_scoreboard_trace_fn */
+VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) =
+{
+ .path = "tcp replay scoreboard",
+ .short_help = "tcp replay scoreboard <connection>",
+ .function = tcp_scoreboard_trace_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI handler for "show tcp punt"
+ *
+ * Prints whether punt_unknown4 and punt_unknown6 are set in tcp main.
+ * Takes no arguments; any input is reported as an error.
+ */
+static clib_error_t *
+show_tcp_punt_fn (vlib_main_t * vm, unformat_input_t * input,
+                  vlib_cli_command_t * cmd_arg)
+{
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  const char *v4_state, *v6_state;
+
+  if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    return clib_error_return (0, "unknown input `%U'", format_unformat_error,
+                              input);
+
+  v4_state = tm->punt_unknown4 ? "enabled" : "disabled";
+  v6_state = tm->punt_unknown6 ? "enabled" : "disabled";
+
+  vlib_cli_output (vm, "IPv4 TCP punt: %s", v4_state);
+  vlib_cli_output (vm, "IPv6 TCP punt: %s", v6_state);
+  return 0;
+}
+/* *INDENT-OFF* */
+/* CLI: "show tcp punt", handled by show_tcp_punt_fn */
+VLIB_CLI_COMMAND (show_tcp_punt_command, static) =
+{
+ .path = "show tcp punt",
+ .short_help = "show tcp punt",
+ .function = show_tcp_punt_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI handler for "show tcp stats"
+ *
+ * For every entry of tm->wrk_ctx prints the number of pending timers and
+ * each worker stat of foreach_tcp_wrk_stat, skipping values that are 0.
+ * Takes no arguments; any input is reported as an error.
+ */
+static clib_error_t *
+show_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_worker_ctx_t *wrk;
+ u32 thread;
+
+ if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ return clib_error_return (0, "unknown input `%U'", format_unformat_error,
+ input);
+ for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++)
+ {
+ wrk = tcp_get_worker (thread);
+ vlib_cli_output (vm, "Thread %u:\n", thread);
+
+ if (clib_fifo_elts (wrk->pending_timers))
+ vlib_cli_output (vm, " %lu pending timers",
+ clib_fifo_elts (wrk->pending_timers));
+
+/* Expands to one conditional print per entry of foreach_tcp_wrk_stat */
+#define _(name,type,str) \
+ if (wrk->stats.name) \
+ vlib_cli_output (vm, " %lu %s", wrk->stats.name, str);
+ foreach_tcp_wrk_stat
+#undef _
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI: "show tcp stats", handled by show_tcp_stats_fn */
+VLIB_CLI_COMMAND (show_tcp_stats_command, static) =
+{
+ .path = "show tcp stats",
+ .short_help = "show tcp stats",
+ .function = show_tcp_stats_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI handler for "clear tcp stats"
+ *
+ * Zeroes the stats of every entry of tm->wrk_ctx. Takes no arguments; any
+ * input is reported as an error.
+ */
+static clib_error_t *
+clear_tcp_stats_fn (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_worker_ctx_t *wrk;
+ u32 thread;
+
+ if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ return clib_error_return (0, "unknown input `%U'", format_unformat_error,
+ input);
+
+ for (thread = 0; thread < vec_len (tm->wrk_ctx); thread++)
+ {
+ wrk = tcp_get_worker (thread);
+ clib_memset (&wrk->stats, 0, sizeof (wrk->stats));
+ }
+
+ return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI: "clear tcp stats", handled by clear_tcp_stats_fn */
+VLIB_CLI_COMMAND (clear_tcp_stats_command, static) =
+{
+ .path = "clear tcp stats",
+ .short_help = "clear tcp stats",
+ .function = clear_tcp_stats_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * Parse the name of a registered cc algorithm
+ *
+ * Expects one vararg: a tcp_cc_algorithm_type_e pointer that receives the
+ * type registered under the parsed name in tcp_main.cc_algo_by_name.
+ *
+ * @return 1 if a known algorithm name was parsed, 0 otherwise
+ */
+uword
+unformat_tcp_cc_algo (unformat_input_t * input, va_list * va)
+{
+  tcp_cc_algorithm_type_e *result = va_arg (*va, tcp_cc_algorithm_type_e *);
+  tcp_main_t *tm = &tcp_main;
+  /* Must start out as 0: it is freed below even if nothing was parsed */
+  char *cc_algo_name = 0;
+  u8 found = 0;
+  uword *p;
+
+  if (unformat (input, "%s", &cc_algo_name)
+      && ((p = hash_get_mem (tm->cc_algo_by_name, cc_algo_name))))
+    {
+      *result = *p;
+      found = 1;
+    }
+
+  vec_free (cc_algo_name);
+  return found;
+}
+
+/**
+ * Parse algorithm specific configuration of a registered cc algorithm
+ *
+ * Matches the input against the name of each algorithm in tm->cc_algos.
+ * For a matching algorithm that provides unformat_cfg, the following sub
+ * input is handed to that callback.
+ *
+ * NOTE(review): the walk continues after a match, and sub_input is not
+ * released here - confirm whether unformat_cfg is expected to free it.
+ *
+ * @return 1 if at least one unformat_cfg callback accepted its sub input,
+ * 0 otherwise
+ */
+uword
+unformat_tcp_cc_algo_cfg (unformat_input_t * input, va_list * va)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_cc_algorithm_t *cc_alg;
+ unformat_input_t sub_input;
+ int found = 0;
+
+ vec_foreach (cc_alg, tm->cc_algos)
+ {
+ /* The algorithm name itself is used as the format to match */
+ if (!unformat (input, cc_alg->name))
+ continue;
+
+ if (cc_alg->unformat_cfg
+ && unformat (input, "%U", unformat_vlib_cli_sub_input, &sub_input))
+ {
+ if (cc_alg->unformat_cfg (&sub_input))
+ found = 1;
+ }
+ }
+ return found;
+}
+
+/**
+ * Startup configuration handler for the "tcp" section
+ *
+ * Parses the supported options into tcp_cfg. Parsing stops with an error
+ * on the first unknown option or on a fifo size of 2^32 or more.
+ *
+ * @return 0 on success, otherwise a clib error describing the failure
+ */
+static clib_error_t *
+tcp_config_fn (vlib_main_t * vm, unformat_input_t * input)
+{
+ u32 cwnd_multiplier, tmp_time;
+ uword memory_size;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "preallocated-connections %d",
+ &tcp_cfg.preallocated_connections))
+ ;
+ else if (unformat (input, "preallocated-half-open-connections %d",
+ &tcp_cfg.preallocated_half_open_connections))
+ ;
+ else if (unformat (input, "buffer-fail-fraction %f",
+ &tcp_cfg.buffer_fail_fraction))
+ ;
+ else if (unformat (input, "max-rx-fifo %U", unformat_memory_size,
+ &memory_size))
+ {
+ /* Sizes of 2^32 and above are rejected */
+ if (memory_size >= 0x100000000)
+ {
+ return clib_error_return
+ (0, "max-rx-fifo %llu (0x%llx) too large", memory_size,
+ memory_size);
+ }
+ tcp_cfg.max_rx_fifo = memory_size;
+ }
+ else if (unformat (input, "min-rx-fifo %U", unformat_memory_size,
+ &memory_size))
+ {
+ /* Sizes of 2^32 and above are rejected */
+ if (memory_size >= 0x100000000)
+ {
+ return clib_error_return
+ (0, "min-rx-fifo %llu (0x%llx) too large", memory_size,
+ memory_size);
+ }
+ tcp_cfg.min_rx_fifo = memory_size;
+ }
+ else if (unformat (input, "mtu %u", &tcp_cfg.default_mtu))
+ ;
+ else if (unformat (input, "rwnd-min-update-ack %d",
+ &tcp_cfg.rwnd_min_update_ack))
+ ;
+ else if (unformat (input, "initial-cwnd-multiplier %u",
+ &cwnd_multiplier))
+ tcp_cfg.initial_cwnd_multiplier = cwnd_multiplier;
+ else if (unformat (input, "no-tx-pacing"))
+ tcp_cfg.enable_tx_pacing = 0;
+ else if (unformat (input, "tso"))
+ tcp_cfg.allow_tso = 1;
+ else if (unformat (input, "no-csum-offload"))
+ tcp_cfg.csum_offload = 0;
+ else if (unformat (input, "cc-algo %U", unformat_tcp_cc_algo,
+ &tcp_cfg.cc_algo))
+ ;
+ else if (unformat (input, "%U", unformat_tcp_cc_algo_cfg))
+ ;
+ /* The state timeouts below are stored divided by TCP_TIMER_TICK,
+ * cleanup-time is stored divided by 1000.0 */
+ else if (unformat (input, "closewait-time %u", &tmp_time))
+ tcp_cfg.closewait_time = tmp_time / TCP_TIMER_TICK;
+ else if (unformat (input, "timewait-time %u", &tmp_time))
+ tcp_cfg.timewait_time = tmp_time / TCP_TIMER_TICK;
+ else if (unformat (input, "finwait1-time %u", &tmp_time))
+ tcp_cfg.finwait1_time = tmp_time / TCP_TIMER_TICK;
+ else if (unformat (input, "finwait2-time %u", &tmp_time))
+ tcp_cfg.finwait2_time = tmp_time / TCP_TIMER_TICK;
+ else if (unformat (input, "lastack-time %u", &tmp_time))
+ tcp_cfg.lastack_time = tmp_time / TCP_TIMER_TICK;
+ else if (unformat (input, "closing-time %u", &tmp_time))
+ tcp_cfg.closing_time = tmp_time / TCP_TIMER_TICK;
+ else if (unformat (input, "cleanup-time %u", &tmp_time))
+ tcp_cfg.cleanup_time = tmp_time / 1000.0;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ return 0;
+}
+
+/* Register tcp_config_fn as handler of the "tcp" configuration section */
+VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp");
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_cubic.c b/src/vnet/tcp/tcp_cubic.c
index b79ef83..b8ac80a 100644
--- a/src/vnet/tcp/tcp_cubic.c
+++ b/src/vnet/tcp/tcp_cubic.c
@@ -14,6 +14,7 @@
*/
#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
#include <math.h>
#define beta_cubic 0.7
diff --git a/src/vnet/tcp/tcp_inlines.h b/src/vnet/tcp/tcp_inlines.h
new file mode 100644
index 0000000..2281cd3
--- /dev/null
+++ b/src/vnet/tcp/tcp_inlines.h
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_VNET_TCP_TCP_INLINES_H_
+#define SRC_VNET_TCP_TCP_INLINES_H_
+
+#include <vnet/tcp/tcp.h>
+
+/**
+ * Get pointer to the TCP header carried by a buffer
+ *
+ * The header is found at the buffer's current data position plus the
+ * tcp.hdr_offset value stored in the buffer's opaque metadata.
+ *
+ * @param b	buffer with tcp.hdr_offset already set
+ * @return	pointer to the TCP header
+ */
+always_inline tcp_header_t *
+tcp_buffer_hdr (vlib_buffer_t * b)
+{
+  u8 *hdr;
+
+  ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE);
+  hdr = b->data + b->current_data + vnet_buffer (b)->tcp.hdr_offset;
+  return (tcp_header_t *) hdr;
+}
+
+/**
+ * Get connection by pool index from a worker's connection pool
+ *
+ * @param conn_index	index of the connection in the worker's pool
+ * @param thread_index	index of the worker that owns the pool
+ * @return		connection or 0 if the pool index is free
+ */
+always_inline tcp_connection_t *
+tcp_connection_get (u32 conn_index, u32 thread_index)
+{
+  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
+
+  return PREDICT_FALSE (pool_is_free_index (wrk->connections, conn_index))
+    ? 0 : pool_elt_at_index (wrk->connections, conn_index);
+}
+
+/**
+ * Get connection by pool index, validating the thread index first
+ *
+ * @param conn_index	index of the connection in the worker's pool
+ * @param thread_index	worker index, checked against the number of
+ *			worker contexts in tcp_main
+ * @return		connection or 0 if either index is not valid
+ */
+always_inline tcp_connection_t *
+tcp_connection_get_if_valid (u32 conn_index, u32 thread_index)
+{
+  tcp_worker_ctx_t *wrk;
+
+  if (thread_index < vec_len (tcp_main.wrk_ctx))
+    {
+      wrk = tcp_get_worker (thread_index);
+      if (!pool_is_free_index (wrk->connections, conn_index))
+	return pool_elt_at_index (wrk->connections, conn_index);
+    }
+
+  return 0;
+}
+
+/**
+ * Set connection state and emit a state change event
+ *
+ * @param tc	connection to update
+ * @param state	new state
+ */
+always_inline void
+tcp_connection_set_state (tcp_connection_t * tc, tcp_state_t state)
+{
+ tc->state = state;
+ /* Event is emitted after the state is written */
+ TCP_EVT (TCP_EVT_STATE_CHANGE, tc);
+}
+
+/**
+ * Get listener by index from the listener pool
+ *
+ * @param tli	listener pool index
+ * @return	listener or 0 if the pool index is free
+ */
+always_inline tcp_connection_t *
+tcp_listener_get (u32 tli)
+{
+  if (pool_is_free_index (tcp_main.listener_pool, tli))
+    return 0;
+
+  return pool_elt_at_index (tcp_main.listener_pool, tli);
+}
+
+/**
+ * Get half-open connection by index
+ *
+ * The pool lookup is done while holding the half-open lock, if the lock
+ * was initialized.
+ *
+ * @param conn_index	index in the half-open connection pool
+ * @return		connection or 0 if the pool index is free
+ */
+always_inline tcp_connection_t *
+tcp_half_open_connection_get (u32 conn_index)
+{
+  tcp_connection_t *ho_conn;
+
+  clib_spinlock_lock_if_init (&tcp_main.half_open_lock);
+  if (pool_is_free_index (tcp_main.half_open_connections, conn_index))
+    ho_conn = 0;
+  else
+    ho_conn = pool_elt_at_index (tcp_main.half_open_connections, conn_index);
+  clib_spinlock_unlock_if_init (&tcp_main.half_open_lock);
+
+  return ho_conn;
+}
+
+/**
+ * Our estimate of the number of bytes that have left the network
+ *
+ * If SACK was negotiated, this is the scoreboard's sacked plus lost bytes.
+ * Otherwise it is one snd_mss per received dupack, capped by the number of
+ * unacknowledged bytes (snd_nxt - snd_una).
+ */
+always_inline u32
+tcp_bytes_out (const tcp_connection_t * tc)
+{
+  if (!tcp_opts_sack_permitted (&tc->rcv_opts))
+    return clib_min (tc->rcv_dupacks * tc->snd_mss,
+		     tc->snd_nxt - tc->snd_una);
+
+  return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
+}
+
+/**
+ * Our estimate of the number of bytes in flight (pipe size)
+ *
+ * Computed as the unacknowledged bytes (snd_nxt - snd_una), less the bytes
+ * estimated to have left the network, plus the retransmitted bytes, less
+ * the retransmitted bytes that were delivered. The result is asserted to
+ * be non-negative.
+ */
+always_inline u32
+tcp_flight_size (const tcp_connection_t * tc)
+{
+  int in_flight = (int) (tc->snd_nxt - tc->snd_una) - tcp_bytes_out (tc)
+    + tc->snd_rxt_bytes - tc->rxt_delivered;
+
+  ASSERT (in_flight >= 0);
+  return in_flight;
+}
+
+/**
+ * Initial cwnd as per RFC5681
+ *
+ * A non-zero configured initial_cwnd_multiplier overrides the RFC values.
+ * Otherwise the window is 4, 3 or 2 segments, shrinking as snd_mss grows
+ * past 1095 and 2190 bytes respectively.
+ */
+always_inline u32
+tcp_initial_cwnd (const tcp_connection_t * tc)
+{
+  if (tcp_cfg.initial_cwnd_multiplier > 0)
+    return tcp_cfg.initial_cwnd_multiplier * tc->snd_mss;
+
+  if (tc->snd_mss <= 1095)
+    return 4 * tc->snd_mss;
+
+  if (tc->snd_mss <= 2190)
+    return 3 * tc->snd_mss;
+
+  return 2 * tc->snd_mss;
+}
+
+/**
+ * Accumulate acked bytes for cwnd increase
+ *
+ * For every thresh bytes accumulated, snd_mss bytes are added to the cwnd.
+ * Any remainder stays in the accumulator and the cwnd is capped at the tx
+ * fifo size.
+ *
+ * @param tc		connection whose cwnd is updated
+ * @param thresh	number of accumulated bytes that triggers an increase
+ * @param bytes		newly acked bytes to add to the accumulator
+ */
+always_inline void
+tcp_cwnd_accumulate (tcp_connection_t * tc, u32 thresh, u32 bytes)
+{
+  u32 n_incr;
+
+  tc->cwnd_acc_bytes += bytes;
+  if (tc->cwnd_acc_bytes < thresh)
+    return;
+
+  n_incr = tc->cwnd_acc_bytes / thresh;
+  tc->cwnd_acc_bytes -= n_incr * thresh;
+  tc->cwnd += n_incr * tc->snd_mss;
+  tc->cwnd = clib_min (tc->cwnd, tc->tx_fifo_size);
+}
+
+/**
+ * Loss window: whatever we have in flight plus the packet we're about
+ * to send
+ */
+always_inline u32
+tcp_loss_wnd (const tcp_connection_t * tc)
+{
+  u32 in_flight = tcp_flight_size (tc);
+
+  return in_flight + tc->snd_mss;
+}
+
+/**
+ * Available send window: the smaller of the connection's cwnd and snd_wnd
+ */
+always_inline u32
+tcp_available_snd_wnd (const tcp_connection_t * tc)
+{
+ return clib_min (tc->cwnd, tc->snd_wnd);
+}
+
+/**
+ * Send space left in the available send window
+ *
+ * Here the in-flight estimate is just the unacknowledged bytes
+ * (snd_nxt - snd_una); no sacked, lost or retransmitted byte accounting
+ * is applied.
+ *
+ * @param tc	connection
+ * @return	available window less unacknowledged bytes, or 0 if the
+ *		window is already full
+ */
+always_inline u32
+tcp_available_output_snd_space (const tcp_connection_t * tc)
+{
+  u32 available_wnd = tcp_available_snd_wnd (tc);
+  /* Kept unsigned so the comparison and subtraction below do not mix
+   * signed and unsigned operands */
+  u32 flight_size = (u32) (tc->snd_nxt - tc->snd_una);
+
+  if (available_wnd <= flight_size)
+    return 0;
+
+  return available_wnd - flight_size;
+}
+
+/**
+ * Estimate of how many bytes we can still push into the network
+ *
+ * @param tc	connection
+ * @return	available send window less the flight size estimate, or 0
+ *		if the flight size already fills the window
+ */
+always_inline u32
+tcp_available_cc_snd_space (const tcp_connection_t * tc)
+{
+  u32 wnd = tcp_available_snd_wnd (tc);
+  u32 in_flight = tcp_flight_size (tc);
+
+  return wnd > in_flight ? wnd - in_flight : 0;
+}
+
+/**
+ * Check if the only outstanding sequence number is that of a sent FIN
+ *
+ * @return 1 if a FIN was sent and snd_una_max is exactly one past snd_una,
+ *	   0 otherwise
+ */
+always_inline u8
+tcp_is_lost_fin (tcp_connection_t * tc)
+{
+  u8 fin_sent = (tc->flags & TCP_CONN_FINSNT) != 0;
+
+  return fin_sent && (tc->snd_una_max - tc->snd_una == 1);
+}
+
+/**
+ * Cached time_now of the calling thread's tcp worker context
+ *
+ * The value is read, not recomputed. See tcp_set_time_now() for how it is
+ * refreshed.
+ */
+always_inline u32
+tcp_time_now (void)
+{
+ return tcp_main.wrk_ctx[vlib_get_thread_index ()].time_now;
+}
+
+/**
+ * Cached time_now of the tcp worker context at the given thread index
+ *
+ * @param thread_index	index into tcp_main.wrk_ctx; not range checked here
+ */
+always_inline u32
+tcp_time_now_w_thread (u32 thread_index)
+{
+ return tcp_main.wrk_ctx[thread_index].time_now;
+}
+
+/**
+ * Generate timestamp for tcp connection
+ *
+ * The timestamp is the cached time_now of the worker that owns the
+ * connection, less the connection's timestamp_delta.
+ */
+always_inline u32
+tcp_tstamp (tcp_connection_t * tc)
+{
+ return (tcp_main.wrk_ctx[tc->c_thread_index].time_now -
+ tc->timestamp_delta);
+}
+
+/**
+ * Current time for a thread, as reported by transport_time_now()
+ *
+ * NOTE(review): the name suggests microsecond precision; the units are
+ * whatever transport_time_now() returns - confirm.
+ */
+always_inline f64
+tcp_time_now_us (u32 thread_index)
+{
+ return transport_time_now (thread_index);
+}
+
+/**
+ * Refresh a worker's cached time_now
+ *
+ * The new value is the cpu clock scaled by tcp_main.tstamp_ticks_per_clock.
+ *
+ * @param wrk	worker context to update
+ * @return	the updated time_now
+ */
+always_inline u32
+tcp_set_time_now (tcp_worker_ctx_t * wrk)
+{
+ wrk->time_now = clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock;
+ return wrk->time_now;
+}
+
+/**
+ * Validate an incoming segment's lengths and find its tcp connection
+ *
+ * Expects the buffer's current data to point at the ip header. On success
+ * the tcp header offset, host order seq/ack numbers, data offset, data
+ * length and end sequence number are stored in the buffer's tcp opaque and
+ * tcp.flags is cleared.
+ *
+ * @param b		buffer to process
+ * @param thread_index	thread the lookup is done for
+ * @param error		set to TCP_ERROR_LENGTH if the buffer or the computed
+ *			payload length is too short, or to TCP_ERROR_NONE plus
+ *			the session lookup result if that result is non-zero;
+ *			left untouched otherwise
+ * @param is_ip4	1 if the packet is ip4, 0 if ip6
+ * @param is_nolookup	if set, skip the session lookup and fetch the
+ *			connection using tcp.connection_index from the buffer
+ * @return		connection, or 0 if none was found or on length error
+ */
+always_inline tcp_connection_t *
+tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error,
+			 u8 is_ip4, u8 is_nolookup)
+{
+  u32 fib_index = vnet_buffer (b)->ip.fib_index;
+  int n_advance_bytes, n_data_bytes;
+  transport_connection_t *tc;
+  tcp_header_t *tcp;
+  u8 result = 0;
+
+  if (is_ip4)
+    {
+      ip4_header_t *ip4 = vlib_buffer_get_current (b);
+      int ip_hdr_bytes = ip4_header_bytes (ip4);
+      if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp)))
+	{
+	  *error = TCP_ERROR_LENGTH;
+	  return 0;
+	}
+      tcp = ip4_next_header (ip4);
+      vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4;
+      n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp));
+      n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes;
+
+      /* Length check. Checksum computed by ipx_local no need to compute again */
+      if (PREDICT_FALSE (n_data_bytes < 0))
+	{
+	  *error = TCP_ERROR_LENGTH;
+	  return 0;
+	}
+
+      if (!is_nolookup)
+	tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
+					    &ip4->src_address, tcp->dst_port,
+					    tcp->src_port,
+					    TRANSPORT_PROTO_TCP, thread_index,
+					    &result);
+    }
+  else
+    {
+      ip6_header_t *ip6 = vlib_buffer_get_current (b);
+      if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp)))
+	{
+	  *error = TCP_ERROR_LENGTH;
+	  return 0;
+	}
+      tcp = ip6_next_header (ip6);
+      vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6;
+      n_advance_bytes = tcp_header_bytes (tcp);
+      /* ip6 payload length does not include the ip6 header, so the header
+       * size is added to the advance only after computing the data length */
+      n_data_bytes = clib_net_to_host_u16 (ip6->payload_length)
+	- n_advance_bytes;
+      n_advance_bytes += sizeof (ip6[0]);
+
+      if (PREDICT_FALSE (n_data_bytes < 0))
+	{
+	  *error = TCP_ERROR_LENGTH;
+	  return 0;
+	}
+
+      if (!is_nolookup)
+	{
+	  if (PREDICT_FALSE
+	      (ip6_address_is_link_local_unicast (&ip6->dst_address)))
+	    {
+	      /* Link-local destination: take the fib index from the rx
+	       * interface. This is an ip6 packet so the ip6 table mapping
+	       * must be used, not the ip4 one. */
+	      ip6_main_t *im = &ip6_main;
+	      fib_index = vec_elt (im->fib_index_by_sw_if_index,
+				   vnet_buffer (b)->sw_if_index[VLIB_RX]);
+	    }
+
+	  tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
+					      &ip6->src_address,
+					      tcp->dst_port, tcp->src_port,
+					      TRANSPORT_PROTO_TCP,
+					      thread_index, &result);
+	}
+    }
+
+  /* No session lookup requested, connection index provided in the buffer */
+  if (is_nolookup)
+    tc = (transport_connection_t *)
+      tcp_connection_get (vnet_buffer (b)->tcp.connection_index,
+			  thread_index);
+
+  vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number);
+  vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number);
+  vnet_buffer (b)->tcp.data_offset = n_advance_bytes;
+  vnet_buffer (b)->tcp.data_len = n_data_bytes;
+  vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number
+    + n_data_bytes;
+  vnet_buffer (b)->tcp.flags = 0;
+
+  /* Only overwrite the caller's error if the lookup reported something */
+  *error = result ? TCP_ERROR_NONE + result : *error;
+
+  return tcp_get_connection_from_transport (tc);
+}
+
+/**
+ * Initialize connection by gleaning network and rcv params from buffer
+ *
+ * Reads tcp.hdr_offset, tcp.seq_number and tcp.ack_number from the buffer
+ * opaque and the options already stored in tc->rcv_opts.
+ *
+ * @param tc		connection to initialize
+ * @param b		buffer whose current data is pointing at ip
+ * @param is_ip4	flag set to 1 if using ip4
+ */
+always_inline void
+tcp_init_w_buffer (tcp_connection_t * tc, vlib_buffer_t * b, u8 is_ip4)
+{
+  tcp_header_t *hdr = tcp_buffer_hdr (b);
+  u32 seq = vnet_buffer (b)->tcp.seq_number;
+
+  /* Local side is the packet's destination, remote side its source */
+  tc->c_lcl_port = hdr->dst_port;
+  tc->c_rmt_port = hdr->src_port;
+  tc->c_is_ip4 = is_ip4;
+
+  if (!is_ip4)
+    {
+      ip6_header_t *ip6 = vlib_buffer_get_current (b);
+      clib_memcpy_fast (&tc->c_lcl_ip6, &ip6->dst_address,
+			sizeof (ip6_address_t));
+      clib_memcpy_fast (&tc->c_rmt_ip6, &ip6->src_address,
+			sizeof (ip6_address_t));
+    }
+  else
+    {
+      ip4_header_t *ip4 = vlib_buffer_get_current (b);
+      tc->c_lcl_ip4.as_u32 = ip4->dst_address.as_u32;
+      tc->c_rmt_ip4.as_u32 = ip4->src_address.as_u32;
+    }
+
+  /* Receive state starts one past the peer's initial sequence number */
+  tc->irs = seq;
+  tc->rcv_nxt = seq + 1;
+  tc->rcv_las = tc->rcv_nxt;
+  tc->sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
+  tc->snd_wl1 = seq;
+  tc->snd_wl2 = vnet_buffer (b)->tcp.ack_number;
+
+  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
+   * segments are used to initialize PAWS. */
+  if (tcp_opts_tstamp (&tc->rcv_opts))
+    {
+      tc->tsval_recent = tc->rcv_opts.tsval;
+      tc->tsval_recent_age = tcp_time_now ();
+    }
+
+  /* snd_wscale must be set before it is used to compute snd_wnd below */
+  if (tcp_opts_wscale (&tc->rcv_opts))
+    tc->snd_wscale = tc->rcv_opts.wscale;
+
+  tc->snd_wnd = clib_net_to_host_u16 (hdr->window) << tc->snd_wscale;
+}
+
+/**
+ * Recompute the retransmit timeout from the current rtt estimates
+ *
+ * rto is set to srtt + 4 * rttvar, capped at TCP_RTO_MAX and then raised to
+ * at least TCP_RTO_MIN.
+ */
+always_inline void
+tcp_update_rto (tcp_connection_t * tc)
+{
+ tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+ tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
+}
+
+/**
+ * Check if the connection's transport connection is descheduled
+ *
+ * @return 1 if transport_connection_is_descheduled() reports non-zero for
+ *	   the underlying transport connection, 0 otherwise
+ */
+always_inline u8
+tcp_is_descheduled (tcp_connection_t * tc)
+{
+ return (transport_connection_is_descheduled (&tc->connection) ? 1 : 0);
+}
+
+/**
+ * Push TCP header to buffer
+ *
+ * Ports, sequence number, ack number and window are written to the header
+ * as provided, so they must already be in network order. Checksum and
+ * urgent pointer are zeroed. The buffer's l4_hdr_offset is set to the new
+ * header and the buffer is flagged with VNET_BUFFER_F_L4_HDR_OFFSET_VALID.
+ *
+ * @param b - buffer to write the header to
+ * @param sp - source port net order
+ * @param dp - destination port net order
+ * @param seq - sequence number net order
+ * @param ack - ack number net order
+ * @param tcp_hdr_opts_len - header and options length in bytes
+ * @param flags - header flags
+ * @param wnd - window size net order
+ *
+ * @return - pointer to start of TCP header
+ */
+always_inline void *
+vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq,
+ u32 ack, u8 tcp_hdr_opts_len, u8 flags,
+ u16 wnd)
+{
+ tcp_header_t *th;
+
+ /* Makes room for header and options; contents are not initialized */
+ th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len);
+
+ th->src_port = sp;
+ th->dst_port = dp;
+ th->seq_number = seq;
+ th->ack_number = ack;
+ /* Length in bytes converted to 4-byte words and stored in the upper
+ * 4 bits of the field */
+ th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4;
+ th->flags = flags;
+ th->window = wnd;
+ th->checksum = 0;
+ th->urgent_pointer = 0;
+ vnet_buffer (b)->l4_hdr_offset = (u8 *) th - b->data;
+ b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID;
+ return th;
+}
+
+/**
+ * Push TCP header to buffer
+ *
+ * Converts sequence number, ack number and window to network order and
+ * hands off to vlib_buffer_push_tcp_net_order(). Ports are passed through
+ * unchanged.
+ *
+ * @param b - buffer to write the header to
+ * @param sp_net - source port net order
+ * @param dp_net - destination port net order
+ * @param seq - sequence number host order
+ * @param ack - ack number host order
+ * @param tcp_hdr_opts_len - header and options length in bytes
+ * @param flags - header flags
+ * @param wnd - window size host order
+ *
+ * @return - pointer to start of TCP header
+ */
+always_inline void *
+vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq,
+		      u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
+{
+  u32 seq_net = clib_host_to_net_u32 (seq);
+  u32 ack_net = clib_host_to_net_u32 (ack);
+  u16 wnd_net = clib_host_to_net_u16 (wnd);
+
+  return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net, seq_net, ack_net,
+					 tcp_hdr_opts_len, flags, wnd_net);
+}
+
+#endif /* SRC_VNET_TCP_TCP_INLINES_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 86158b9..e27cffb 100755
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -16,8 +16,8 @@
#include <vppinfra/sparse_vec.h>
#include <vnet/fib/ip4_fib.h>
#include <vnet/fib/ip6_fib.h>
-#include <vnet/tcp/tcp_packet.h>
#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
#include <vnet/session/session.h>
#include <math.h>
@@ -116,119 +116,6 @@
}
/**
- * Parse TCP header options.
- *
- * @param th TCP header
- * @param to TCP options data structure to be populated
- * @param is_syn set if packet is syn
- * @return -1 if parsing failed
- */
-static inline int
-tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn)
-{
- const u8 *data;
- u8 opt_len, opts_len, kind;
- int j;
- sack_block_t b;
-
- opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
- data = (const u8 *) (th + 1);
-
- /* Zero out all flags but those set in SYN */
- to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
- | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
-
- for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
- {
- kind = data[0];
-
- /* Get options length */
- if (kind == TCP_OPTION_EOL)
- break;
- else if (kind == TCP_OPTION_NOOP)
- {
- opt_len = 1;
- continue;
- }
- else
- {
- /* broken options */
- if (opts_len < 2)
- return -1;
- opt_len = data[1];
-
- /* weird option length */
- if (opt_len < 2 || opt_len > opts_len)
- return -1;
- }
-
- /* Parse options */
- switch (kind)
- {
- case TCP_OPTION_MSS:
- if (!is_syn)
- break;
- if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
- {
- to->flags |= TCP_OPTS_FLAG_MSS;
- to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
- }
- break;
- case TCP_OPTION_WINDOW_SCALE:
- if (!is_syn)
- break;
- if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
- {
- to->flags |= TCP_OPTS_FLAG_WSCALE;
- to->wscale = data[2];
- if (to->wscale > TCP_MAX_WND_SCALE)
- to->wscale = TCP_MAX_WND_SCALE;
- }
- break;
- case TCP_OPTION_TIMESTAMP:
- if (is_syn)
- to->flags |= TCP_OPTS_FLAG_TSTAMP;
- if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
- && opt_len == TCP_OPTION_LEN_TIMESTAMP)
- {
- to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
- to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
- }
- break;
- case TCP_OPTION_SACK_PERMITTED:
- if (!is_syn)
- break;
- if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
- to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
- break;
- case TCP_OPTION_SACK_BLOCK:
- /* If SACK permitted was not advertised or a SYN, break */
- if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
- break;
-
- /* If too short or not correctly formatted, break */
- if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
- break;
-
- to->flags |= TCP_OPTS_FLAG_SACK;
- to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
- vec_reset_length (to->sacks);
- for (j = 0; j < to->n_sack_blocks; j++)
- {
- b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
- b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
- vec_add1 (to->sacks, b);
- }
- break;
- default:
- /* Nothing to see here */
- continue;
- }
- }
- return 0;
-}
-
-/**
* RFC1323: Check against wrapped sequence numbers (PAWS). If we have
* timestamp to echo and it's less than tsval_recent, drop segment
* but still send an ACK in order to retain TCP's mechanism for detecting
@@ -565,15 +452,6 @@
}
}
-#ifndef CLIB_MARCH_VARIANT
-void
-tcp_update_rto (tcp_connection_t * tc)
-{
- tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
- tc->rto = clib_max (tc->rto, TCP_RTO_MIN);
-}
-#endif /* CLIB_MARCH_VARIANT */
-
/**
* Update RTT estimate and RTO timer
*
@@ -726,567 +604,6 @@
tc->burst_acked += tc->bytes_acked;
}
-#ifndef CLIB_MARCH_VARIANT
-static u32
-scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
-{
- ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes));
- return hole - sb->holes;
-}
-
-static u32
-scoreboard_hole_bytes (sack_scoreboard_hole_t * hole)
-{
- return hole->end - hole->start;
-}
-
-sack_scoreboard_hole_t *
-scoreboard_get_hole (sack_scoreboard_t * sb, u32 index)
-{
- if (index != TCP_INVALID_SACK_HOLE_INDEX)
- return pool_elt_at_index (sb->holes, index);
- return 0;
-}
-
-sack_scoreboard_hole_t *
-scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
-{
- if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
- return pool_elt_at_index (sb->holes, hole->next);
- return 0;
-}
-
-sack_scoreboard_hole_t *
-scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
-{
- if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
- return pool_elt_at_index (sb->holes, hole->prev);
- return 0;
-}
-
-sack_scoreboard_hole_t *
-scoreboard_first_hole (sack_scoreboard_t * sb)
-{
- if (sb->head != TCP_INVALID_SACK_HOLE_INDEX)
- return pool_elt_at_index (sb->holes, sb->head);
- return 0;
-}
-
-sack_scoreboard_hole_t *
-scoreboard_last_hole (sack_scoreboard_t * sb)
-{
- if (sb->tail != TCP_INVALID_SACK_HOLE_INDEX)
- return pool_elt_at_index (sb->holes, sb->tail);
- return 0;
-}
-
-static void
-scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
-{
- sack_scoreboard_hole_t *next, *prev;
-
- if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
- {
- next = pool_elt_at_index (sb->holes, hole->next);
- next->prev = hole->prev;
- }
- else
- {
- sb->tail = hole->prev;
- }
-
- if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
- {
- prev = pool_elt_at_index (sb->holes, hole->prev);
- prev->next = hole->next;
- }
- else
- {
- sb->head = hole->next;
- }
-
- if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
- sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
-
- /* Poison the entry */
- if (CLIB_DEBUG > 0)
- clib_memset (hole, 0xfe, sizeof (*hole));
-
- pool_put (sb->holes, hole);
-}
-
-static sack_scoreboard_hole_t *
-scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
- u32 start, u32 end)
-{
- sack_scoreboard_hole_t *hole, *next, *prev;
- u32 hole_index;
-
- pool_get (sb->holes, hole);
- clib_memset (hole, 0, sizeof (*hole));
-
- hole->start = start;
- hole->end = end;
- hole_index = scoreboard_hole_index (sb, hole);
-
- prev = scoreboard_get_hole (sb, prev_index);
- if (prev)
- {
- hole->prev = prev_index;
- hole->next = prev->next;
-
- if ((next = scoreboard_next_hole (sb, hole)))
- next->prev = hole_index;
- else
- sb->tail = hole_index;
-
- prev->next = hole_index;
- }
- else
- {
- sb->head = hole_index;
- hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
- hole->next = TCP_INVALID_SACK_HOLE_INDEX;
- }
-
- return hole;
-}
-
-always_inline void
-scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end,
- u8 has_rxt)
-{
- if (!has_rxt || seq_geq (start, sb->high_rxt))
- return;
-
- sb->rxt_sacked +=
- seq_lt (end, sb->high_rxt) ? (end - start) : (sb->high_rxt - start);
-}
-
-always_inline void
-scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss)
-{
- sack_scoreboard_hole_t *left, *right;
- u32 sacked = 0, blks = 0, old_sacked;
-
- old_sacked = sb->sacked_bytes;
-
- sb->last_lost_bytes = 0;
- sb->lost_bytes = 0;
- sb->sacked_bytes = 0;
-
- right = scoreboard_last_hole (sb);
- if (!right)
- {
- sb->sacked_bytes = sb->high_sacked - ack;
- sb->last_sacked_bytes = sb->sacked_bytes
- - (old_sacked - sb->last_bytes_delivered);
- return;
- }
-
- if (seq_gt (sb->high_sacked, right->end))
- {
- sacked = sb->high_sacked - right->end;
- blks = 1;
- }
-
- while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss
- && blks < TCP_DUPACK_THRESHOLD)
- {
- if (right->is_lost)
- sb->lost_bytes += scoreboard_hole_bytes (right);
-
- left = scoreboard_prev_hole (sb, right);
- if (!left)
- {
- ASSERT (right->start == ack || sb->is_reneging);
- sacked += right->start - ack;
- right = 0;
- break;
- }
-
- sacked += right->start - left->end;
- blks++;
- right = left;
- }
-
- /* right is first lost */
- while (right)
- {
- sb->lost_bytes += scoreboard_hole_bytes (right);
- sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start);
- right->is_lost = 1;
- left = scoreboard_prev_hole (sb, right);
- if (!left)
- {
- ASSERT (right->start == ack || sb->is_reneging);
- sacked += right->start - ack;
- break;
- }
- sacked += right->start - left->end;
- right = left;
- }
-
- sb->sacked_bytes = sacked;
- sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered);
-}
-
-/**
- * Figure out the next hole to retransmit
- *
- * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
- */
-sack_scoreboard_hole_t *
-scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
- sack_scoreboard_hole_t * start,
- u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
-{
- sack_scoreboard_hole_t *hole = 0;
-
- hole = start ? start : scoreboard_first_hole (sb);
- while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
- hole = scoreboard_next_hole (sb, hole);
-
- /* Nothing, return */
- if (!hole)
- {
- sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
- return 0;
- }
-
- /* Rule (1): if higher than rxt, less than high_sacked and lost */
- if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
- {
- sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
- }
- else
- {
- /* Rule (2): available unsent data */
- if (have_unsent)
- {
- sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
- return 0;
- }
- /* Rule (3): if hole not lost */
- else if (seq_lt (hole->start, sb->high_sacked))
- {
- /* And we didn't already retransmit it */
- if (seq_leq (hole->end, sb->high_rxt))
- {
- sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
- return 0;
- }
- *snd_limited = 0;
- sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
- }
- /* Rule (4): if hole beyond high_sacked */
- else
- {
- ASSERT (seq_geq (hole->start, sb->high_sacked));
- *snd_limited = 1;
- *can_rescue = 1;
- /* HighRxt MUST NOT be updated */
- return 0;
- }
- }
-
- if (hole && seq_lt (sb->high_rxt, hole->start))
- sb->high_rxt = hole->start;
-
- return hole;
-}
-
-void
-scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una)
-{
- sack_scoreboard_hole_t *hole;
- hole = scoreboard_first_hole (sb);
- if (hole)
- {
- snd_una = seq_gt (snd_una, hole->start) ? snd_una : hole->start;
- sb->cur_rxt_hole = sb->head;
- }
- sb->high_rxt = snd_una;
- sb->rescue_rxt = snd_una - 1;
-}
-
-void
-scoreboard_init (sack_scoreboard_t * sb)
-{
- sb->head = TCP_INVALID_SACK_HOLE_INDEX;
- sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
- sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
-}
-
-void
-scoreboard_clear (sack_scoreboard_t * sb)
-{
- sack_scoreboard_hole_t *hole;
- while ((hole = scoreboard_first_hole (sb)))
- {
- scoreboard_remove_hole (sb, hole);
- }
- ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
- ASSERT (pool_elts (sb->holes) == 0);
- sb->sacked_bytes = 0;
- sb->last_sacked_bytes = 0;
- sb->last_bytes_delivered = 0;
- sb->lost_bytes = 0;
- sb->last_lost_bytes = 0;
- sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
- sb->is_reneging = 0;
-}
-
-void
-scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end)
-{
- sack_scoreboard_hole_t *last_hole;
-
- clib_warning ("sack reneging");
-
- scoreboard_clear (sb);
- last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
- start, end);
- last_hole->is_lost = 1;
- sb->tail = scoreboard_hole_index (sb, last_hole);
- sb->high_sacked = start;
- scoreboard_init_rxt (sb, start);
-}
-
-#endif /* CLIB_MARCH_VARIANT */
-
-/**
- * Test that scoreboard is sane after recovery
- *
- * Returns 1 if scoreboard is empty or if first hole beyond
- * snd_una.
- */
-static u8
-tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc)
-{
- sack_scoreboard_hole_t *hole;
- hole = scoreboard_first_hole (&tc->sack_sb);
- return (!hole || (seq_geq (hole->start, tc->snd_una)
- && seq_lt (hole->end, tc->snd_nxt)));
-}
-
-#ifndef CLIB_MARCH_VARIANT
-
-void
-tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
-{
- sack_scoreboard_hole_t *hole, *next_hole;
- sack_scoreboard_t *sb = &tc->sack_sb;
- sack_block_t *blk, *rcv_sacks;
- u32 blk_index = 0, i, j;
- u8 has_rxt;
-
- sb->last_sacked_bytes = 0;
- sb->last_bytes_delivered = 0;
- sb->rxt_sacked = 0;
-
- if (!tcp_opts_sack (&tc->rcv_opts) && !sb->sacked_bytes
- && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
- return;
-
- has_rxt = tcp_in_cong_recovery (tc);
-
- /* Remove invalid blocks */
- blk = tc->rcv_opts.sacks;
- while (blk < vec_end (tc->rcv_opts.sacks))
- {
- if (seq_lt (blk->start, blk->end)
- && seq_gt (blk->start, tc->snd_una)
- && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt))
- {
- blk++;
- continue;
- }
- vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
- }
-
- /* Add block for cumulative ack */
- if (seq_gt (ack, tc->snd_una))
- {
- vec_add2 (tc->rcv_opts.sacks, blk, 1);
- blk->start = tc->snd_una;
- blk->end = ack;
- }
-
- if (vec_len (tc->rcv_opts.sacks) == 0)
- return;
-
- tcp_scoreboard_trace_add (tc, ack);
-
- /* Make sure blocks are ordered */
- rcv_sacks = tc->rcv_opts.sacks;
- for (i = 0; i < vec_len (rcv_sacks); i++)
- for (j = i + 1; j < vec_len (rcv_sacks); j++)
- if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start))
- {
- sack_block_t tmp = rcv_sacks[i];
- rcv_sacks[i] = rcv_sacks[j];
- rcv_sacks[j] = tmp;
- }
-
- if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
- {
- /* Handle reneging as a special case */
- if (PREDICT_FALSE (sb->is_reneging))
- {
- /* No holes, only sacked bytes */
- if (seq_leq (tc->snd_nxt, sb->high_sacked))
- {
- /* No progress made so return */
- if (seq_leq (ack, tc->snd_una))
- return;
-
- /* Update sacked bytes delivered and return */
- sb->last_bytes_delivered = ack - tc->snd_una;
- sb->sacked_bytes -= sb->last_bytes_delivered;
- sb->is_reneging = seq_lt (ack, sb->high_sacked);
- return;
- }
-
- /* New hole above high sacked. Add it and process normally */
- hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
- sb->high_sacked, tc->snd_nxt);
- sb->tail = scoreboard_hole_index (sb, hole);
- }
- /* Not reneging and no holes. Insert the first that covers all
- * outstanding bytes */
- else
- {
- hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
- tc->snd_una, tc->snd_nxt);
- sb->tail = scoreboard_hole_index (sb, hole);
- }
- sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end;
- }
- else
- {
- /* If we have holes but snd_nxt is beyond the last hole, update
- * last hole end or add new hole after high sacked */
- hole = scoreboard_last_hole (sb);
- if (seq_gt (tc->snd_nxt, hole->end))
- {
- if (seq_geq (hole->start, sb->high_sacked))
- {
- hole->end = tc->snd_nxt;
- }
- /* New hole after high sacked block */
- else if (seq_lt (sb->high_sacked, tc->snd_nxt))
- {
- scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
- tc->snd_nxt);
- }
- }
-
- /* Keep track of max byte sacked for when the last hole
- * is acked */
- sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end,
- sb->high_sacked);
- }
-
- /* Walk the holes with the SACK blocks */
- hole = pool_elt_at_index (sb->holes, sb->head);
-
- if (PREDICT_FALSE (sb->is_reneging))
- {
- sb->last_bytes_delivered += clib_min (hole->start - tc->snd_una,
- ack - tc->snd_una);
- sb->is_reneging = seq_lt (ack, hole->start);
- }
-
- while (hole && blk_index < vec_len (rcv_sacks))
- {
- blk = &rcv_sacks[blk_index];
- if (seq_leq (blk->start, hole->start))
- {
- /* Block covers hole. Remove hole */
- if (seq_geq (blk->end, hole->end))
- {
- next_hole = scoreboard_next_hole (sb, hole);
-
- /* If covered by ack, compute delivered bytes */
- if (blk->end == ack)
- {
- u32 sacked = next_hole ? next_hole->start : sb->high_sacked;
- if (PREDICT_FALSE (seq_lt (ack, sacked)))
- {
- sb->last_bytes_delivered += ack - hole->end;
- sb->is_reneging = 1;
- }
- else
- {
- sb->last_bytes_delivered += sacked - hole->end;
- sb->is_reneging = 0;
- }
- }
- scoreboard_update_sacked_rxt (sb, hole->start, hole->end,
- has_rxt);
- scoreboard_remove_hole (sb, hole);
- hole = next_hole;
- }
- /* Partial 'head' overlap */
- else
- {
- if (seq_gt (blk->end, hole->start))
- {
- scoreboard_update_sacked_rxt (sb, hole->start, blk->end,
- has_rxt);
- hole->start = blk->end;
- }
- blk_index++;
- }
- }
- else
- {
- /* Hole must be split */
- if (seq_lt (blk->end, hole->end))
- {
- u32 hole_index = scoreboard_hole_index (sb, hole);
- next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
- hole->end);
- /* Pool might've moved */
- hole = scoreboard_get_hole (sb, hole_index);
- hole->end = blk->start;
-
- scoreboard_update_sacked_rxt (sb, blk->start, blk->end,
- has_rxt);
-
- blk_index++;
- ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
- }
- else if (seq_lt (blk->start, hole->end))
- {
- scoreboard_update_sacked_rxt (sb, blk->start, hole->end,
- has_rxt);
- hole->end = blk->start;
- }
- hole = scoreboard_next_hole (sb, hole);
- }
- }
-
- scoreboard_update_bytes (sb, ack, tc->snd_mss);
-
- ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
- ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
- || sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack));
- ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt
- - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
- ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
- || sb->is_reneging || sb->holes[sb->head].start == ack);
- ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
- ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes
- - sb->last_bytes_delivered >= sb->rxt_sacked);
- ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered
- || (tc->flags & TCP_CONN_FINSNT));
-
- TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc);
-}
-#endif /* CLIB_MARCH_VARIANT */
-
/**
* Try to update snd_wnd based on feedback received from peer.
*
@@ -1825,89 +1142,6 @@
*error = TCP_ERROR_FIN_RCVD;
}
-#ifndef CLIB_MARCH_VARIANT
-static u8
-tcp_sack_vector_is_sane (sack_block_t * sacks)
-{
- int i;
- for (i = 1; i < vec_len (sacks); i++)
- {
- if (sacks[i - 1].end == sacks[i].start)
- return 0;
- }
- return 1;
-}
-
-/**
- * Build SACK list as per RFC2018.
- *
- * Makes sure the first block contains the segment that generated the current
- * ACK and the following ones are the ones most recently reported in SACK
- * blocks.
- *
- * @param tc TCP connection for which the SACK list is updated
- * @param start Start sequence number of the newest SACK block
- * @param end End sequence of the newest SACK block
- */
-void
-tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
-{
- sack_block_t *new_list = tc->snd_sacks_fl, *block = 0;
- int i;
-
- /* If the first segment is ooo add it to the list. Last write might've moved
- * rcv_nxt over the first segment. */
- if (seq_lt (tc->rcv_nxt, start))
- {
- vec_add2 (new_list, block, 1);
- block->start = start;
- block->end = end;
- }
-
- /* Find the blocks still worth keeping. */
- for (i = 0; i < vec_len (tc->snd_sacks); i++)
- {
- /* Discard if rcv_nxt advanced beyond current block */
- if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
- continue;
-
- /* Merge or drop if segment overlapped by the new segment */
- if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
- && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
- {
- if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
- new_list[0].start = tc->snd_sacks[i].start;
- if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
- new_list[0].end = tc->snd_sacks[i].end;
- continue;
- }
-
- /* Save to new SACK list if we have space. */
- if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
- vec_add1 (new_list, tc->snd_sacks[i]);
- }
-
- ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
-
- /* Replace old vector with new one */
- vec_reset_length (tc->snd_sacks);
- tc->snd_sacks_fl = tc->snd_sacks;
- tc->snd_sacks = new_list;
-
- /* Segments should not 'touch' */
- ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
-}
-
-u32
-tcp_sack_list_bytes (tcp_connection_t * tc)
-{
- u32 bytes = 0, i;
- for (i = 0; i < vec_len (tc->snd_sacks); i++)
- bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start;
- return bytes;
-}
-#endif /* CLIB_MARCH_VARIANT */
-
/** Enqueue data for delivery to application */
static int
tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
@@ -3551,102 +2785,6 @@
}
}
-always_inline tcp_connection_t *
-tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error,
- u8 is_ip4, u8 is_nolookup)
-{
- u32 fib_index = vnet_buffer (b)->ip.fib_index;
- int n_advance_bytes, n_data_bytes;
- transport_connection_t *tc;
- tcp_header_t *tcp;
- u8 result = 0;
-
- if (is_ip4)
- {
- ip4_header_t *ip4 = vlib_buffer_get_current (b);
- int ip_hdr_bytes = ip4_header_bytes (ip4);
- if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp)))
- {
- *error = TCP_ERROR_LENGTH;
- return 0;
- }
- tcp = ip4_next_header (ip4);
- vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4;
- n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp));
- n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes;
-
- /* Length check. Checksum computed by ipx_local no need to compute again */
- if (PREDICT_FALSE (n_data_bytes < 0))
- {
- *error = TCP_ERROR_LENGTH;
- return 0;
- }
-
- if (!is_nolookup)
- tc = session_lookup_connection_wt4 (fib_index, &ip4->dst_address,
- &ip4->src_address, tcp->dst_port,
- tcp->src_port,
- TRANSPORT_PROTO_TCP, thread_index,
- &result);
- }
- else
- {
- ip6_header_t *ip6 = vlib_buffer_get_current (b);
- if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp)))
- {
- *error = TCP_ERROR_LENGTH;
- return 0;
- }
- tcp = ip6_next_header (ip6);
- vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6;
- n_advance_bytes = tcp_header_bytes (tcp);
- n_data_bytes = clib_net_to_host_u16 (ip6->payload_length)
- - n_advance_bytes;
- n_advance_bytes += sizeof (ip6[0]);
-
- if (PREDICT_FALSE (n_data_bytes < 0))
- {
- *error = TCP_ERROR_LENGTH;
- return 0;
- }
-
- if (!is_nolookup)
- {
- if (PREDICT_FALSE
- (ip6_address_is_link_local_unicast (&ip6->dst_address)))
- {
- ip4_main_t *im = &ip4_main;
- fib_index = vec_elt (im->fib_index_by_sw_if_index,
- vnet_buffer (b)->sw_if_index[VLIB_RX]);
- }
-
- tc = session_lookup_connection_wt6 (fib_index, &ip6->dst_address,
- &ip6->src_address,
- tcp->dst_port, tcp->src_port,
- TRANSPORT_PROTO_TCP,
- thread_index, &result);
- }
- }
-
- if (is_nolookup)
- tc =
- (transport_connection_t *) tcp_connection_get (vnet_buffer (b)->
- tcp.connection_index,
- thread_index);
-
- vnet_buffer (b)->tcp.seq_number = clib_net_to_host_u32 (tcp->seq_number);
- vnet_buffer (b)->tcp.ack_number = clib_net_to_host_u32 (tcp->ack_number);
- vnet_buffer (b)->tcp.data_offset = n_advance_bytes;
- vnet_buffer (b)->tcp.data_len = n_data_bytes;
- vnet_buffer (b)->tcp.seq_end = vnet_buffer (b)->tcp.seq_number
- + n_data_bytes;
- vnet_buffer (b)->tcp.flags = 0;
-
- *error = result ? TCP_ERROR_NONE + result : *error;
-
- return tcp_get_connection_from_transport (tc);
-}
-
static inline void
tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc,
vlib_buffer_t * b, u16 * next,
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index 69dd224..c5ffc2a 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -14,6 +14,7 @@
*/
#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
typedef struct nwreno_cfg_
{
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index d07fb2e..aeeffa7 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -14,6 +14,7 @@
*/
#include <vnet/tcp/tcp.h>
+#include <vnet/tcp/tcp_inlines.h>
#include <math.h>
typedef enum _tcp_output_next
@@ -166,90 +167,6 @@
return tc->rcv_wnd >> tc->rcv_wscale;
}
-/**
- * Write TCP options to segment.
- */
-static u32
-tcp_options_write (u8 * data, tcp_options_t * opts)
-{
- u32 opts_len = 0;
- u32 buf, seq_len = 4;
-
- if (tcp_opts_mss (opts))
- {
- *data++ = TCP_OPTION_MSS;
- *data++ = TCP_OPTION_LEN_MSS;
- buf = clib_host_to_net_u16 (opts->mss);
- clib_memcpy_fast (data, &buf, sizeof (opts->mss));
- data += sizeof (opts->mss);
- opts_len += TCP_OPTION_LEN_MSS;
- }
-
- if (tcp_opts_wscale (opts))
- {
- *data++ = TCP_OPTION_WINDOW_SCALE;
- *data++ = TCP_OPTION_LEN_WINDOW_SCALE;
- *data++ = opts->wscale;
- opts_len += TCP_OPTION_LEN_WINDOW_SCALE;
- }
-
- if (tcp_opts_sack_permitted (opts))
- {
- *data++ = TCP_OPTION_SACK_PERMITTED;
- *data++ = TCP_OPTION_LEN_SACK_PERMITTED;
- opts_len += TCP_OPTION_LEN_SACK_PERMITTED;
- }
-
- if (tcp_opts_tstamp (opts))
- {
- *data++ = TCP_OPTION_TIMESTAMP;
- *data++ = TCP_OPTION_LEN_TIMESTAMP;
- buf = clib_host_to_net_u32 (opts->tsval);
- clib_memcpy_fast (data, &buf, sizeof (opts->tsval));
- data += sizeof (opts->tsval);
- buf = clib_host_to_net_u32 (opts->tsecr);
- clib_memcpy_fast (data, &buf, sizeof (opts->tsecr));
- data += sizeof (opts->tsecr);
- opts_len += TCP_OPTION_LEN_TIMESTAMP;
- }
-
- if (tcp_opts_sack (opts))
- {
- int i;
-
- if (opts->n_sack_blocks != 0)
- {
- *data++ = TCP_OPTION_SACK_BLOCK;
- *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
- for (i = 0; i < opts->n_sack_blocks; i++)
- {
- buf = clib_host_to_net_u32 (opts->sacks[i].start);
- clib_memcpy_fast (data, &buf, seq_len);
- data += seq_len;
- buf = clib_host_to_net_u32 (opts->sacks[i].end);
- clib_memcpy_fast (data, &buf, seq_len);
- data += seq_len;
- }
- opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
- }
- }
-
- /* Terminate TCP options */
- if (opts_len % 4)
- {
- *data++ = TCP_OPTION_EOL;
- opts_len += TCP_OPTION_LEN_EOL;
- }
-
- /* Pad with zeroes to a u32 boundary */
- while (opts_len % 4)
- {
- *data++ = TCP_OPTION_NOOP;
- opts_len += TCP_OPTION_LEN_NOOP;
- }
- return opts_len;
-}
-
static int
tcp_make_syn_options (tcp_connection_t * tc, tcp_options_t * opts)
{
@@ -563,7 +480,7 @@
/**
* Convert buffer to FIN-ACK
*/
-void
+static void
tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b)
{
tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK);
@@ -598,7 +515,7 @@
/**
* Convert buffer to SYN-ACK
*/
-void
+static void
tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b)
{
tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h
index 1e637a8..fcc55ff 100644
--- a/src/vnet/tcp/tcp_packet.h
+++ b/src/vnet/tcp/tcp_packet.h
@@ -172,6 +172,219 @@
#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */
#define TCP_OPTS_ALIGN 4
#define TCP_OPTS_MAX_SACK_BLOCKS 3
+
+/* Modulo arithmetic for TCP sequence numbers. The difference of the two
+ * values is cast to i32 and compared against zero. Note that seq_max
+ * evaluates its arguments more than once. */
+#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0)
+#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0)
+#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0)
+#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0)
+#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2))
+
+/* Modulo arithmetic for timestamps, same i32 difference comparison as for
+ * sequence numbers */
+#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
+#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0)
+
+/**
+ * Parse TCP header options.
+ *
+ * Walks the option bytes that follow the fixed header and records the
+ * recognized ones in the options structure. The MSS, window scale,
+ * timestamp and SACK permitted flags already set in the structure are
+ * preserved, all other flags are cleared before parsing.
+ *
+ * @param th TCP header
+ * @param to TCP options data structure to be populated
+ * @param is_syn set if packet is syn
+ * @return 0 on success, -1 if parsing failed, i.e., the data offset is
+ *         smaller than the fixed header, an option is truncated or an
+ *         option length is invalid
+ */
+always_inline int
+tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn)
+{
+  const u8 *data;
+  u8 opt_len, opts_len, kind;
+  int j;
+  sack_block_t b;
+
+  /* A data offset smaller than the fixed header would make the options
+   * length computed lower wrap around and the walk run past the header */
+  if ((u32) (tcp_doff (th) << 2) < sizeof (tcp_header_t))
+    return -1;
+
+  opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
+  data = (const u8 *) (th + 1);
+
+  /* Zero out all flags but those set in SYN */
+  to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
+                | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
+
+  for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
+    {
+      kind = data[0];
+
+      /* Get options length */
+      if (kind == TCP_OPTION_EOL)
+        break;
+      else if (kind == TCP_OPTION_NOOP)
+        {
+          /* Single byte option, no length field */
+          opt_len = 1;
+          continue;
+        }
+      else
+        {
+          /* broken options */
+          if (opts_len < 2)
+            return -1;
+          opt_len = data[1];
+
+          /* weird option length */
+          if (opt_len < 2 || opt_len > opts_len)
+            return -1;
+        }
+
+      /* Parse options */
+      switch (kind)
+        {
+        case TCP_OPTION_MSS:
+          if (!is_syn)
+            break;
+          if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
+            {
+              to->flags |= TCP_OPTS_FLAG_MSS;
+              to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
+            }
+          break;
+        case TCP_OPTION_WINDOW_SCALE:
+          if (!is_syn)
+            break;
+          if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
+            {
+              to->flags |= TCP_OPTS_FLAG_WSCALE;
+              to->wscale = data[2];
+              /* Clamp to the largest supported shift */
+              if (to->wscale > TCP_MAX_WND_SCALE)
+                to->wscale = TCP_MAX_WND_SCALE;
+            }
+          break;
+        case TCP_OPTION_TIMESTAMP:
+          if (is_syn)
+            to->flags |= TCP_OPTS_FLAG_TSTAMP;
+          if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
+              && opt_len == TCP_OPTION_LEN_TIMESTAMP)
+            {
+              to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
+              to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
+            }
+          break;
+        case TCP_OPTION_SACK_PERMITTED:
+          if (!is_syn)
+            break;
+          if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
+            to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
+          break;
+        case TCP_OPTION_SACK_BLOCK:
+          /* If SACK permitted was not advertised or a SYN, break */
+          if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
+            break;
+
+          /* If too short or not correctly formatted, break */
+          if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
+            break;
+
+          to->flags |= TCP_OPTS_FLAG_SACK;
+          to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
+          vec_reset_length (to->sacks);
+          for (j = 0; j < to->n_sack_blocks; j++)
+            {
+              b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
+              b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
+              vec_add1 (to->sacks, b);
+            }
+          break;
+        default:
+          /* Nothing to see here */
+          continue;
+        }
+    }
+  return 0;
+}
+
+/**
+ * Write TCP options to segment.
+ *
+ * Options are written in the order MSS, window scale, SACK permitted,
+ * timestamp and SACK blocks, each only if the corresponding flag is set in
+ * the options structure. If the result is not a multiple of 4 bytes, an EOL
+ * option is appended followed by NOOPs up to the next u32 boundary.
+ *
+ * @param data buffer where to write the options
+ * @param opts options to write
+ * @return length of options written, including termination and padding
+ */
+always_inline u32
+tcp_options_write (u8 * data, tcp_options_t * opts)
+{
+  u32 opts_len = 0;
+  u32 buf, seq_len = 4;
+
+  if (tcp_opts_mss (opts))
+    {
+      /* Stage the value in a 2 byte temporary. Copying the first two bytes
+       * of a u32 would write zeroes on big endian hosts */
+      u16 mss = clib_host_to_net_u16 (opts->mss);
+
+      *data++ = TCP_OPTION_MSS;
+      *data++ = TCP_OPTION_LEN_MSS;
+      clib_memcpy_fast (data, &mss, sizeof (mss));
+      data += sizeof (mss);
+      opts_len += TCP_OPTION_LEN_MSS;
+    }
+
+  if (tcp_opts_wscale (opts))
+    {
+      *data++ = TCP_OPTION_WINDOW_SCALE;
+      *data++ = TCP_OPTION_LEN_WINDOW_SCALE;
+      *data++ = opts->wscale;
+      opts_len += TCP_OPTION_LEN_WINDOW_SCALE;
+    }
+
+  if (tcp_opts_sack_permitted (opts))
+    {
+      *data++ = TCP_OPTION_SACK_PERMITTED;
+      *data++ = TCP_OPTION_LEN_SACK_PERMITTED;
+      opts_len += TCP_OPTION_LEN_SACK_PERMITTED;
+    }
+
+  if (tcp_opts_tstamp (opts))
+    {
+      *data++ = TCP_OPTION_TIMESTAMP;
+      *data++ = TCP_OPTION_LEN_TIMESTAMP;
+      buf = clib_host_to_net_u32 (opts->tsval);
+      clib_memcpy_fast (data, &buf, sizeof (opts->tsval));
+      data += sizeof (opts->tsval);
+      buf = clib_host_to_net_u32 (opts->tsecr);
+      clib_memcpy_fast (data, &buf, sizeof (opts->tsecr));
+      data += sizeof (opts->tsecr);
+      opts_len += TCP_OPTION_LEN_TIMESTAMP;
+    }
+
+  if (tcp_opts_sack (opts))
+    {
+      int i;
+
+      if (opts->n_sack_blocks != 0)
+        {
+          /* Kind and length bytes, followed by start/end pairs */
+          *data++ = TCP_OPTION_SACK_BLOCK;
+          *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
+          for (i = 0; i < opts->n_sack_blocks; i++)
+            {
+              buf = clib_host_to_net_u32 (opts->sacks[i].start);
+              clib_memcpy_fast (data, &buf, seq_len);
+              data += seq_len;
+              buf = clib_host_to_net_u32 (opts->sacks[i].end);
+              clib_memcpy_fast (data, &buf, seq_len);
+              data += seq_len;
+            }
+          opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
+        }
+    }
+
+  /* Terminate TCP options */
+  if (opts_len % 4)
+    {
+      *data++ = TCP_OPTION_EOL;
+      opts_len += TCP_OPTION_LEN_EOL;
+    }
+
+  /* Pad with NOOPs to a u32 boundary */
+  while (opts_len % 4)
+    {
+      *data++ = TCP_OPTION_NOOP;
+      opts_len += TCP_OPTION_LEN_NOOP;
+    }
+  return opts_len;
+}
+
#endif /* included_tcp_packet_h */
/*
diff --git a/src/vnet/tcp/tcp_sack.c b/src/vnet/tcp/tcp_sack.c
new file mode 100644
index 0000000..3388dd6
--- /dev/null
+++ b/src/vnet/tcp/tcp_sack.c
@@ -0,0 +1,607 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/tcp/tcp_sack.h>
+
+/**
+ * Unlink a hole from the scoreboard's list and return it to the pool.
+ *
+ * Head, tail and the current retransmit hole are updated if they referred to
+ * the removed hole.
+ */
+static void
+scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+  sack_scoreboard_hole_t *succ, *pred;
+
+  /* Fix the backward link, or the tail if the hole was last */
+  if (hole->next == TCP_INVALID_SACK_HOLE_INDEX)
+    sb->tail = hole->prev;
+  else
+    {
+      succ = pool_elt_at_index (sb->holes, hole->next);
+      succ->prev = hole->prev;
+    }
+
+  /* Fix the forward link, or the head if the hole was first */
+  if (hole->prev == TCP_INVALID_SACK_HOLE_INDEX)
+    sb->head = hole->next;
+  else
+    {
+      pred = pool_elt_at_index (sb->holes, hole->prev);
+      pred->next = hole->next;
+    }
+
+  /* Do not leave the retransmit cursor pointing at a freed hole */
+  if (sb->cur_rxt_hole == scoreboard_hole_index (sb, hole))
+    sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+
+  /* Poison the entry */
+  if (CLIB_DEBUG > 0)
+    clib_memset (hole, 0xfe, sizeof (*hole));
+
+  pool_put (sb->holes, hole);
+}
+
+/**
+ * Allocate a hole covering [start, end) and link it after prev_index.
+ *
+ * If prev_index does not resolve to a hole, the new hole becomes the head
+ * with no neighbors. The tail is not updated on that path.
+ *
+ * @return the newly allocated hole
+ */
+static sack_scoreboard_hole_t *
+scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
+                        u32 start, u32 end)
+{
+  sack_scoreboard_hole_t *hole, *succ, *pred;
+  u32 hole_index;
+
+  pool_get (sb->holes, hole);
+  clib_memset (hole, 0, sizeof (*hole));
+
+  hole->start = start;
+  hole->end = end;
+  hole_index = scoreboard_hole_index (sb, hole);
+
+  /* Looked up only after the allocation above */
+  pred = scoreboard_get_hole (sb, prev_index);
+  if (!pred)
+    {
+      sb->head = hole_index;
+      hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
+      hole->next = TCP_INVALID_SACK_HOLE_INDEX;
+      return hole;
+    }
+
+  /* Splice in between pred and its old successor, if any */
+  hole->prev = prev_index;
+  hole->next = pred->next;
+
+  succ = scoreboard_next_hole (sb, hole);
+  if (succ)
+    succ->prev = hole_index;
+  else
+    sb->tail = hole_index;
+
+  pred->next = hole_index;
+
+  return hole;
+}
+
+/**
+ * Account newly sacked bytes in [start, end) that lie below high_rxt.
+ *
+ * Nothing is counted if has_rxt is not set or if the range starts at or
+ * above high_rxt.
+ */
+always_inline void
+scoreboard_update_sacked_rxt (sack_scoreboard_t * sb, u32 start, u32 end,
+                              u8 has_rxt)
+{
+  u32 right_edge;
+
+  if (!has_rxt || seq_geq (start, sb->high_rxt))
+    return;
+
+  /* Only the part of the range below high_rxt counts */
+  right_edge = seq_lt (end, sb->high_rxt) ? end : sb->high_rxt;
+  sb->rxt_sacked += right_edge - start;
+}
+
+/**
+ * Recompute the scoreboard's sacked and lost byte counters.
+ *
+ * Walks the holes from the tail towards the head, summing the sacked bytes
+ * found between holes. Once (TCP_DUPACK_THRESHOLD - 1) * snd_mss sacked
+ * bytes or TCP_DUPACK_THRESHOLD sacked blocks were counted, all remaining
+ * holes are marked as lost.
+ *
+ * @param sb scoreboard to update
+ * @param ack ack number, used as left edge when the first hole is reached
+ * @param snd_mss segment size used for the sacked bytes threshold
+ */
+always_inline void
+scoreboard_update_bytes (sack_scoreboard_t * sb, u32 ack, u32 snd_mss)
+{
+ sack_scoreboard_hole_t *left, *right;
+ u32 sacked = 0, blks = 0, old_sacked;
+
+ old_sacked = sb->sacked_bytes;
+
+ sb->last_lost_bytes = 0;
+ sb->lost_bytes = 0;
+ sb->sacked_bytes = 0;
+
+ right = scoreboard_last_hole (sb);
+ /* No holes, so everything between ack and high_sacked is sacked */
+ if (!right)
+ {
+ sb->sacked_bytes = sb->high_sacked - ack;
+ sb->last_sacked_bytes = sb->sacked_bytes
+ - (old_sacked - sb->last_bytes_delivered);
+ return;
+ }
+
+ /* Bytes sacked above the last hole count as the first block */
+ if (seq_gt (sb->high_sacked, right->end))
+ {
+ sacked = sb->high_sacked - right->end;
+ blks = 1;
+ }
+
+ /* Walk towards the head until one of the loss thresholds is reached */
+ while (sacked < (TCP_DUPACK_THRESHOLD - 1) * snd_mss
+ && blks < TCP_DUPACK_THRESHOLD)
+ {
+ if (right->is_lost)
+ sb->lost_bytes += scoreboard_hole_bytes (right);
+
+ left = scoreboard_prev_hole (sb, right);
+ if (!left)
+ {
+ ASSERT (right->start == ack || sb->is_reneging);
+ sacked += right->start - ack;
+ right = 0;
+ break;
+ }
+
+ sacked += right->start - left->end;
+ blks++;
+ right = left;
+ }
+
+ /* right is first lost */
+ while (right)
+ {
+ sb->lost_bytes += scoreboard_hole_bytes (right);
+ /* Only holes not previously marked lost count as newly lost */
+ sb->last_lost_bytes += right->is_lost ? 0 : (right->end - right->start);
+ right->is_lost = 1;
+ left = scoreboard_prev_hole (sb, right);
+ if (!left)
+ {
+ ASSERT (right->start == ack || sb->is_reneging);
+ sacked += right->start - ack;
+ break;
+ }
+ sacked += right->start - left->end;
+ right = left;
+ }
+
+ sb->sacked_bytes = sacked;
+ sb->last_sacked_bytes = sacked - (old_sacked - sb->last_bytes_delivered);
+}
+
+/**
+ * Figure out the next hole to retransmit
+ *
+ * Follows logic proposed in RFC6675 Sec. 4, NextSeg()
+ *
+ * @param sb scoreboard to search
+ * @param start hole to start the search from, or 0 to start from the
+ * first hole
+ * @param have_unsent if set and the candidate hole does not qualify under
+ * rule (1), no hole is returned
+ * @param can_rescue set to 1 only if the candidate hole starts at or
+ * beyond high_sacked, otherwise left untouched
+ * @param snd_limited set to 0 under rule (3) and to 1 under rule (4),
+ * otherwise left untouched
+ * @return hole to retransmit or 0 if none was selected. When a hole is
+ * returned, cur_rxt_hole is set to it and high_rxt is advanced
+ * to its start if it was behind.
+ */
+sack_scoreboard_hole_t *
+scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
+ sack_scoreboard_hole_t * start,
+ u8 have_unsent, u8 * can_rescue, u8 * snd_limited)
+{
+ sack_scoreboard_hole_t *hole = 0;
+
+ /* Skip lost holes that end at or below high_rxt */
+ hole = start ? start : scoreboard_first_hole (sb);
+ while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
+ hole = scoreboard_next_hole (sb, hole);
+
+ /* Nothing, return */
+ if (!hole)
+ {
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
+ }
+
+ /* Rule (1): if higher than rxt, less than high_sacked and lost */
+ if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
+ {
+ sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
+ }
+ else
+ {
+ /* Rule (2): available unsent data */
+ if (have_unsent)
+ {
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
+ }
+ /* Rule (3): if hole not lost */
+ else if (seq_lt (hole->start, sb->high_sacked))
+ {
+ /* And we didn't already retransmit it */
+ if (seq_leq (hole->end, sb->high_rxt))
+ {
+ sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+ return 0;
+ }
+ *snd_limited = 0;
+ sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
+ }
+ /* Rule (4): if hole beyond high_sacked */
+ else
+ {
+ ASSERT (seq_geq (hole->start, sb->high_sacked));
+ *snd_limited = 1;
+ *can_rescue = 1;
+ /* HighRxt MUST NOT be updated */
+ return 0;
+ }
+ }
+
+ /* Never move high_rxt backwards, only up to the selected hole's start */
+ if (hole && seq_lt (sb->high_rxt, hole->start))
+ sb->high_rxt = hole->start;
+
+ return hole;
+}
+
+/**
+ * Initialize the retransmit state of the scoreboard.
+ *
+ * high_rxt is set to snd_una or, if a first hole exists and starts beyond
+ * snd_una, to the start of that hole. rescue_rxt is set one byte lower.
+ */
+void
+scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una)
+{
+  sack_scoreboard_hole_t *first = scoreboard_first_hole (sb);
+
+  if (first)
+    {
+      /* Start from whichever is higher, snd_una or the first hole */
+      if (!seq_gt (snd_una, first->start))
+        snd_una = first->start;
+      sb->cur_rxt_hole = sb->head;
+    }
+
+  sb->high_rxt = snd_una;
+  sb->rescue_rxt = snd_una - 1;
+}
+
+/**
+ * Initialize the scoreboard's hole list as empty, i.e., with no head, no
+ * tail and no current retransmit hole.
+ */
+void
+scoreboard_init (sack_scoreboard_t * sb)
+{
+  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+  sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
+  sb->head = TCP_INVALID_SACK_HOLE_INDEX;
+}
+
+/**
+ * Remove all holes from the scoreboard and reset its byte counters, the
+ * current retransmit hole and the reneging flag.
+ */
+void
+scoreboard_clear (sack_scoreboard_t * sb)
+{
+  sack_scoreboard_hole_t *first;
+
+  /* Drop holes from the head until none is left */
+  for (first = scoreboard_first_hole (sb); first;
+       first = scoreboard_first_hole (sb))
+    scoreboard_remove_hole (sb, first);
+
+  ASSERT (sb->head == sb->tail && sb->head == TCP_INVALID_SACK_HOLE_INDEX);
+  ASSERT (pool_elts (sb->holes) == 0);
+
+  /* Byte accounting */
+  sb->sacked_bytes = 0;
+  sb->last_sacked_bytes = 0;
+  sb->last_bytes_delivered = 0;
+  sb->lost_bytes = 0;
+  sb->last_lost_bytes = 0;
+
+  /* Retransmit and reneging state */
+  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+  sb->is_reneging = 0;
+}
+
+/**
+ * Reset the scoreboard after SACK reneging.
+ *
+ * The scoreboard is cleared and a single hole covering [start, end) is
+ * inserted and marked lost. high_sacked and the retransmit state are
+ * restarted from start.
+ */
+void
+scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end)
+{
+  sack_scoreboard_hole_t *lost;
+
+  clib_warning ("sack reneging");
+
+  scoreboard_clear (sb);
+
+  /* The one hole is both head, set on insert, and tail */
+  lost = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, start, end);
+  lost->is_lost = 1;
+  sb->tail = scoreboard_hole_index (sb, lost);
+
+  sb->high_sacked = start;
+  scoreboard_init_rxt (sb, start);
+}
+
+/**
+ * Test that scoreboard is sane after recovery
+ *
+ * Returns 1 if scoreboard is empty or if the first hole starts at or
+ * beyond snd_una and ends before snd_nxt.
+ */
+u8
+tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc)
+{
+ sack_scoreboard_hole_t *hole;
+ hole = scoreboard_first_hole (&tc->sack_sb);
+ return (!hole || (seq_geq (hole->start, tc->snd_una)
+ && seq_lt (hole->end, tc->snd_nxt)));
+}
+
+/**
+ * Process the SACK blocks received with an ack and update the scoreboard.
+ *
+ * Invalid blocks are dropped from the received options, a block for the
+ * cumulatively acked range is added if ack advanced beyond snd_una, and the
+ * sorted blocks are then walked against the scoreboard's holes, removing,
+ * trimming or splitting holes as they are covered. Byte counters are
+ * recomputed at the end.
+ *
+ * @param tc connection whose scoreboard is updated
+ * @param ack ack number of the segment that carried the SACK blocks
+ */
+void
+tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
+{
+ sack_scoreboard_hole_t *hole, *next_hole;
+ sack_scoreboard_t *sb = &tc->sack_sb;
+ sack_block_t *blk, *rcv_sacks;
+ u32 blk_index = 0, i, j;
+ u8 has_rxt;
+
+ sb->last_sacked_bytes = 0;
+ sb->last_bytes_delivered = 0;
+ sb->rxt_sacked = 0;
+
+ /* Nothing to do if no SACKs were received and the scoreboard is empty */
+ if (!tcp_opts_sack (&tc->rcv_opts) && !sb->sacked_bytes
+ && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
+ return;
+
+ has_rxt = tcp_in_cong_recovery (tc);
+
+ /* Remove invalid blocks */
+ blk = tc->rcv_opts.sacks;
+ while (blk < vec_end (tc->rcv_opts.sacks))
+ {
+ if (seq_lt (blk->start, blk->end)
+ && seq_gt (blk->start, tc->snd_una)
+ && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt))
+ {
+ blk++;
+ continue;
+ }
+ /* blk is not advanced, so the slot is re-checked after the delete */
+ vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
+ }
+
+ /* Add block for cumulative ack */
+ if (seq_gt (ack, tc->snd_una))
+ {
+ vec_add2 (tc->rcv_opts.sacks, blk, 1);
+ blk->start = tc->snd_una;
+ blk->end = ack;
+ }
+
+ if (vec_len (tc->rcv_opts.sacks) == 0)
+ return;
+
+ tcp_scoreboard_trace_add (tc, ack);
+
+ /* Make sure blocks are ordered */
+ rcv_sacks = tc->rcv_opts.sacks;
+ for (i = 0; i < vec_len (rcv_sacks); i++)
+ for (j = i + 1; j < vec_len (rcv_sacks); j++)
+ if (seq_lt (rcv_sacks[j].start, rcv_sacks[i].start))
+ {
+ sack_block_t tmp = rcv_sacks[i];
+ rcv_sacks[i] = rcv_sacks[j];
+ rcv_sacks[j] = tmp;
+ }
+
+ if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
+ {
+ /* Handle reneging as a special case */
+ if (PREDICT_FALSE (sb->is_reneging))
+ {
+ /* No holes, only sacked bytes */
+ if (seq_leq (tc->snd_nxt, sb->high_sacked))
+ {
+ /* No progress made so return */
+ if (seq_leq (ack, tc->snd_una))
+ return;
+
+ /* Update sacked bytes delivered and return */
+ sb->last_bytes_delivered = ack - tc->snd_una;
+ sb->sacked_bytes -= sb->last_bytes_delivered;
+ sb->is_reneging = seq_lt (ack, sb->high_sacked);
+ return;
+ }
+
+ /* New hole above high sacked. Add it and process normally */
+ hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
+ sb->high_sacked, tc->snd_nxt);
+ sb->tail = scoreboard_hole_index (sb, hole);
+ }
+ /* Not reneging and no holes. Insert the first that covers all
+ * outstanding bytes */
+ else
+ {
+ hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
+ tc->snd_una, tc->snd_nxt);
+ sb->tail = scoreboard_hole_index (sb, hole);
+ }
+ /* Blocks are sorted by start, use the end of the last one */
+ sb->high_sacked = rcv_sacks[vec_len (rcv_sacks) - 1].end;
+ }
+ else
+ {
+ /* If we have holes but snd_nxt is beyond the last hole, update
+ * last hole end or add new hole after high sacked */
+ hole = scoreboard_last_hole (sb);
+ if (seq_gt (tc->snd_nxt, hole->end))
+ {
+ if (seq_geq (hole->start, sb->high_sacked))
+ {
+ hole->end = tc->snd_nxt;
+ }
+ /* New hole after high sacked block */
+ else if (seq_lt (sb->high_sacked, tc->snd_nxt))
+ {
+ scoreboard_insert_hole (sb, sb->tail, sb->high_sacked,
+ tc->snd_nxt);
+ }
+ }
+
+ /* Keep track of max byte sacked for when the last hole
+ * is acked */
+ sb->high_sacked = seq_max (rcv_sacks[vec_len (rcv_sacks) - 1].end,
+ sb->high_sacked);
+ }
+
+ /* Walk the holes with the SACK blocks */
+ hole = pool_elt_at_index (sb->holes, sb->head);
+
+ if (PREDICT_FALSE (sb->is_reneging))
+ {
+ sb->last_bytes_delivered += clib_min (hole->start - tc->snd_una,
+ ack - tc->snd_una);
+ sb->is_reneging = seq_lt (ack, hole->start);
+ }
+
+ while (hole && blk_index < vec_len (rcv_sacks))
+ {
+ blk = &rcv_sacks[blk_index];
+ if (seq_leq (blk->start, hole->start))
+ {
+ /* Block covers hole. Remove hole */
+ if (seq_geq (blk->end, hole->end))
+ {
+ /* Grab the successor before the hole is removed */
+ next_hole = scoreboard_next_hole (sb, hole);
+
+ /* If covered by ack, compute delivered bytes */
+ if (blk->end == ack)
+ {
+ u32 sacked = next_hole ? next_hole->start : sb->high_sacked;
+ if (PREDICT_FALSE (seq_lt (ack, sacked)))
+ {
+ sb->last_bytes_delivered += ack - hole->end;
+ sb->is_reneging = 1;
+ }
+ else
+ {
+ sb->last_bytes_delivered += sacked - hole->end;
+ sb->is_reneging = 0;
+ }
+ }
+ scoreboard_update_sacked_rxt (sb, hole->start, hole->end,
+ has_rxt);
+ scoreboard_remove_hole (sb, hole);
+ hole = next_hole;
+ }
+ /* Partial 'head' overlap */
+ else
+ {
+ if (seq_gt (blk->end, hole->start))
+ {
+ scoreboard_update_sacked_rxt (sb, hole->start, blk->end,
+ has_rxt);
+ hole->start = blk->end;
+ }
+ blk_index++;
+ }
+ }
+ else
+ {
+ /* Hole must be split */
+ if (seq_lt (blk->end, hole->end))
+ {
+ u32 hole_index = scoreboard_hole_index (sb, hole);
+ next_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
+ hole->end);
+ /* Pool might've moved */
+ hole = scoreboard_get_hole (sb, hole_index);
+ hole->end = blk->start;
+
+ scoreboard_update_sacked_rxt (sb, blk->start, blk->end,
+ has_rxt);
+
+ blk_index++;
+ ASSERT (hole->next == scoreboard_hole_index (sb, next_hole));
+ }
+ /* Partial 'tail' overlap, block is kept for the next hole */
+ else if (seq_lt (blk->start, hole->end))
+ {
+ scoreboard_update_sacked_rxt (sb, blk->start, hole->end,
+ has_rxt);
+ hole->end = blk->start;
+ }
+ hole = scoreboard_next_hole (sb, hole);
+ }
+ }
+
+ scoreboard_update_bytes (sb, ack, tc->snd_mss);
+
+ ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc));
+ ASSERT (sb->sacked_bytes == 0 || tcp_in_recovery (tc)
+ || sb->sacked_bytes <= tc->snd_nxt - seq_max (tc->snd_una, ack));
+ ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_nxt
+ - seq_max (tc->snd_una, ack) || tcp_in_recovery (tc));
+ ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc)
+ || sb->is_reneging || sb->holes[sb->head].start == ack);
+ ASSERT (sb->last_lost_bytes <= sb->lost_bytes);
+ ASSERT ((ack - tc->snd_una) + sb->last_sacked_bytes
+ - sb->last_bytes_delivered >= sb->rxt_sacked);
+ ASSERT ((ack - tc->snd_una) >= tc->sack_sb.last_bytes_delivered
+ || (tc->flags & TCP_CONN_FINSNT));
+
+ TCP_EVT (TCP_EVT_CC_SCOREBOARD, tc);
+}
+
+/**
+ * Check that no SACK block starts exactly where the previous one ends.
+ *
+ * @return 1 if no two consecutive blocks touch, 0 otherwise
+ */
+static u8
+tcp_sack_vector_is_sane (sack_block_t * sacks)
+{
+  int i = 1;
+
+  while (i < vec_len (sacks))
+    {
+      if (sacks[i].start == sacks[i - 1].end)
+        return 0;
+      i++;
+    }
+  return 1;
+}
+
+/**
+ * Rebuild the list of SACK blocks to advertise, as per RFC2018.
+ *
+ * The newest block, i.e., the one for the segment that generated the
+ * current ACK, goes first and is followed by the previously reported blocks
+ * that are still relevant.
+ *
+ * @param tc TCP connection for which the SACK list is updated
+ * @param start Start sequence number of the newest SACK block
+ * @param end End sequence of the newest SACK block
+ */
+void
+tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
+{
+  sack_block_t *new_list = tc->snd_sacks_fl, *block = 0, *old;
+  int i;
+
+  /* Track the new segment only if it is still out of order. The last write
+   * might have moved rcv_nxt over it. */
+  if (seq_lt (tc->rcv_nxt, start))
+    {
+      vec_add2 (new_list, block, 1);
+      block->start = start;
+      block->end = end;
+    }
+
+  /* Go over the old blocks and decide which to keep */
+  for (i = 0; i < vec_len (tc->snd_sacks); i++)
+    {
+      old = &tc->snd_sacks[i];
+
+      /* Stale, rcv_nxt reached or passed the block's start */
+      if (seq_leq (old->start, tc->rcv_nxt))
+        continue;
+
+      /* Overlaps or touches the new block, so grow the new block instead */
+      if (block && seq_geq (old->end, new_list[0].start)
+          && seq_leq (old->start, new_list[0].end))
+        {
+          if (seq_lt (old->start, new_list[0].start))
+            new_list[0].start = old->start;
+          if (seq_lt (new_list[0].end, old->end))
+            new_list[0].end = old->end;
+          continue;
+        }
+
+      /* Keep it as long as there is room left */
+      if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
+        vec_add1 (new_list, *old);
+    }
+
+  ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
+
+  /* Swap vectors, the emptied old one is reused on the next update */
+  vec_reset_length (tc->snd_sacks);
+  tc->snd_sacks_fl = tc->snd_sacks;
+  tc->snd_sacks = new_list;
+
+  /* Segments should not 'touch' */
+  ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks));
+}
+
+/**
+ * Sum of the bytes covered by the connection's snd_sacks blocks.
+ */
+u32
+tcp_sack_list_bytes (tcp_connection_t * tc)
+{
+  sack_block_t *blk;
+  u32 total = 0;
+
+  for (blk = tc->snd_sacks; blk < vec_end (tc->snd_sacks); blk++)
+    total += blk->end - blk->start;
+
+  return total;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_sack.h b/src/vnet/tcp/tcp_sack.h
new file mode 100644
index 0000000..1c3fa95
--- /dev/null
+++ b/src/vnet/tcp/tcp_sack.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_VNET_TCP_TCP_SACK_H_
+#define SRC_VNET_TCP_TCP_SACK_H_
+
+#include <vnet/tcp/tcp_types.h>
+
+/**
+ * Index of a hole within the scoreboard's pool of holes. Asserts that the
+ * index is not a free pool index.
+ */
+always_inline u32
+scoreboard_hole_index (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+ ASSERT (!pool_is_free_index (sb->holes, hole - sb->holes));
+ return hole - sb->holes;
+}
+
+/**
+ * Number of bytes covered by a hole, i.e., end minus start.
+ */
+always_inline u32
+scoreboard_hole_bytes (sack_scoreboard_hole_t * hole)
+{
+ return hole->end - hole->start;
+}
+
+/**
+ * Hole at the given pool index or 0 if the index is
+ * TCP_INVALID_SACK_HOLE_INDEX.
+ */
+always_inline sack_scoreboard_hole_t *
+scoreboard_get_hole (sack_scoreboard_t * sb, u32 index)
+{
+  if (index == TCP_INVALID_SACK_HOLE_INDEX)
+    return 0;
+
+  return pool_elt_at_index (sb->holes, index);
+}
+
+/**
+ * Hole that follows the given one or 0 if it is the last.
+ */
+always_inline sack_scoreboard_hole_t *
+scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+  return scoreboard_get_hole (sb, hole->next);
+}
+
+/**
+ * Hole that precedes the given one or 0 if it is the first.
+ */
+always_inline sack_scoreboard_hole_t *
+scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+  return scoreboard_get_hole (sb, hole->prev);
+}
+
+/**
+ * First hole of the scoreboard or 0 if there are no holes.
+ */
+always_inline sack_scoreboard_hole_t *
+scoreboard_first_hole (sack_scoreboard_t * sb)
+{
+  return scoreboard_get_hole (sb, sb->head);
+}
+
+/**
+ * Last hole of the scoreboard or 0 if there are no holes.
+ */
+always_inline sack_scoreboard_hole_t *
+scoreboard_last_hole (sack_scoreboard_t * sb)
+{
+  return scoreboard_get_hole (sb, sb->tail);
+}
+
+#if TCP_SCOREBOARD_TRACE
+/* Append the connection's received SACK blocks to the scoreboard trace.
+ * All elements added by one invocation share the same group number and
+ * the element whose end matches the ack also records ack and snd_una_max.
+ * Wrapped in do/while (0) so that it expands to a single statement. */
+#define tcp_scoreboard_trace_add(_tc, _ack) \
+do { \
+  static u64 _group = 0; \
+  sack_scoreboard_t *_sb = &(_tc)->sack_sb; \
+  sack_block_t *_sack, *_sacks; \
+  scoreboard_trace_elt_t *_elt; \
+  int _i; \
+  _group++; \
+  _sacks = (_tc)->rcv_opts.sacks; \
+  for (_i = 0; _i < vec_len (_sacks); _i++) \
+    { \
+      _sack = &_sacks[_i]; \
+      vec_add2 (_sb->trace, _elt, 1); \
+      _elt->start = _sack->start; \
+      _elt->end = _sack->end; \
+      _elt->ack = _elt->end == (_ack) ? (_ack) : 0; \
+      _elt->snd_una_max = _elt->end == (_ack) ? (_tc)->snd_una_max : 0; \
+      _elt->group = _group; \
+    } \
+} while (0)
+#else
+#define tcp_scoreboard_trace_add(_tc, _ack)
+#endif
+
+/* Scoreboard management. Parameter names match the definitions in
+ * tcp_sack.c */
+sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
+                                                  sack_scoreboard_hole_t *
+                                                  start, u8 have_unsent,
+                                                  u8 * can_rescue,
+                                                  u8 * snd_limited);
+void scoreboard_clear (sack_scoreboard_t * sb);
+void scoreboard_clear_reneging (sack_scoreboard_t * sb, u32 start, u32 end);
+void scoreboard_init (sack_scoreboard_t * sb);
+void scoreboard_init_rxt (sack_scoreboard_t * sb, u32 snd_una);
+
+format_function_t format_tcp_scoreboard;
+
+/* Made public for unit testing only */
+void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
+u32 tcp_sack_list_bytes (tcp_connection_t * tc);
+void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack);
+u8 *tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose);
+u8 tcp_scoreboard_is_sane_post_recovery (tcp_connection_t * tc);
+
+#endif /* SRC_VNET_TCP_TCP_SACK_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/tcp/tcp_types.h b/src/vnet/tcp/tcp_types.h
new file mode 100644
index 0000000..ccb7ae8
--- /dev/null
+++ b/src/vnet/tcp/tcp_types.h
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2020 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_VNET_TCP_TCP_TYPES_H_
+#define SRC_VNET_TCP_TCP_TYPES_H_
+
+#include <vppinfra/clib.h>
+#include <vppinfra/rbtree.h>
+#include <vnet/tcp/tcp_packet.h>
+#include <vnet/session/transport.h>
+
+#define TCP_TICK 0.001 /**< TCP tick period (s) */
+#define THZ ((u32) (1 / TCP_TICK)) /**< TCP tick frequency */
+#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */
+#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * THZ) /**< 24 days, in ticks */
+#define TCP_FIB_RECHECK_PERIOD (1 * THZ) /**< Recheck every 1s */
+#define TCP_MAX_OPTION_SPACE 40
+#define TCP_CC_DATA_SZ 24 /**< Size of tcp_connection_t cc_data[] */
+#define TCP_MAX_GSO_SZ 65536
+#define TCP_RXT_MAX_BURST 10
+
+#define TCP_DUPACK_THRESHOLD 3
+#define TCP_IW_N_SEGMENTS 10
+#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */
+#define TCP_USE_SACKS 1 /**< Disable only for testing */
+
+/** TCP FSM states as per RFC793. Entries are _(symbol, name string). */
+#define foreach_tcp_fsm_state \
+ _(CLOSED, "CLOSED") \
+ _(LISTEN, "LISTEN") \
+ _(SYN_SENT, "SYN_SENT") \
+ _(SYN_RCVD, "SYN_RCVD") \
+ _(ESTABLISHED, "ESTABLISHED") \
+ _(CLOSE_WAIT, "CLOSE_WAIT") \
+ _(FIN_WAIT_1, "FIN_WAIT_1") \
+ _(LAST_ACK, "LAST_ACK") \
+ _(CLOSING, "CLOSING") \
+ _(FIN_WAIT_2, "FIN_WAIT_2") \
+ _(TIME_WAIT, "TIME_WAIT")
+
+typedef enum _tcp_state
+{
+#define _(sym, str) TCP_STATE_##sym,
+ foreach_tcp_fsm_state
+#undef _
+ TCP_N_STATES /**< Count of the states above, not a state itself */
+} tcp_state_t;
+
+/** TCP timers. Entries are _(symbol, name string). */
+#define foreach_tcp_timer \
+  _(RETRANSMIT, "RETRANSMIT") \
+  _(DELACK, "DELAYED ACK") \
+  _(PERSIST, "PERSIST") \
+  _(WAITCLOSE, "WAIT CLOSE") \
+  _(RETRANSMIT_SYN, "RETRANSMIT SYN")
+
+typedef enum _tcp_timers
+{
+#define _(sym, str) TCP_TIMER_##sym,
+  foreach_tcp_timer
+#undef _
+  TCP_N_TIMERS /**< Number of timers; sizes tcp_connection_t timers[] */
+} tcp_timers_e;
+
+#define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
+
+#define TCP_TIMER_TICK 0.1 /**< Timer tick in seconds */
+#define TCP_TO_TIMER_TICK (TCP_TICK * 10) /**< Factor for converting
+ ticks to timer ticks */
+
+#define TCP_RTO_MAX (60 * THZ) /* Min max RTO (60s) as per RFC6298 */
+#define TCP_RTO_MIN (0.2 * THZ) /* Min RTO (200ms) - lower than standard */
+#define TCP_RTT_MAX (30 * THZ) /* 30s (probably too much) */
+#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */
+#define TCP_RTO_INIT (1 * THZ) /* Initial retransmit timer */
+#define TCP_RTO_BOFF_MAX 8 /* Max number of retries before reset */
+#define TCP_ESTABLISH_TIME (60 * THZ) /* Connection establish timeout */
+
+/** Connection configuration flags. Entries are _(symbol, name string). */
+#define foreach_tcp_cfg_flag \
+  _(RATE_SAMPLE, "Rate sampling") \
+  _(NO_CSUM_OFFLOAD, "No csum offload") \
+  _(NO_TSO, "TSO off") \
+  _(TSO, "TSO") \
+  _(NO_ENDPOINT, "No endpoint")
+
+typedef enum tcp_cfg_flag_bits_
+{
+#define _(sym, str) TCP_CFG_F_##sym##_BIT,
+  foreach_tcp_cfg_flag
+#undef _
+  TCP_CFG_N_FLAG_BITS /**< Number of flag bits defined above */
+} tcp_cfg_flag_bits_e;
+
+typedef enum tcp_cfg_flag_
+{
+#define _(sym, str) TCP_CFG_F_##sym = 1 << TCP_CFG_F_##sym##_BIT,
+  foreach_tcp_cfg_flag
+#undef _
+  TCP_CFG_N_FLAGS /**< NOTE: evaluates to last flag mask + 1, not a count */
+} tcp_cfg_flags_e;
+
+/** TCP connection flags. Entries are _(symbol, name string). */
+#define foreach_tcp_connection_flag \
+  _(SNDACK, "Send ACK") \
+  _(FINSNT, "FIN sent") \
+  _(RECOVERY, "Recovery") \
+  _(FAST_RECOVERY, "Fast Recovery") \
+  _(DCNT_PENDING, "Disconnect pending") \
+  _(HALF_OPEN_DONE, "Half-open completed") \
+  _(FINPNDG, "FIN pending") \
+  _(RXT_PENDING, "Retransmit pending") \
+  _(FRXT_FIRST, "Retransmit first") \
+  _(DEQ_PENDING, "Dequeue pending ") \
+  _(PSH_PENDING, "PSH pending") \
+  _(FINRCVD, "FIN received") \
+  _(ZERO_RWND_SENT, "Zero RWND sent")
+
+typedef enum tcp_connection_flag_bits_
+{
+#define _(sym, str) TCP_CONN_##sym##_BIT,
+  foreach_tcp_connection_flag
+#undef _
+  TCP_CONN_N_FLAG_BITS /**< Number of flag bits; must fit u16 flags field */
+} tcp_connection_flag_bits_e;
+
+typedef enum tcp_connection_flag_
+{
+#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT,
+  foreach_tcp_connection_flag
+#undef _
+  TCP_CONN_N_FLAGS /**< NOTE: evaluates to last flag mask + 1, not a count */
+} tcp_connection_flags_e;
+
+#define TCP_SCOREBOARD_TRACE (0) /**< Non-zero compiles in scoreboard tracing */
+#define TCP_MAX_SACK_BLOCKS 256 /**< Max number of SACK blocks stored */
+#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) /**< Marks an unset hole index */
+
+typedef struct _scoreboard_trace_elt
+{
+ u32 start; /**< Start of the traced SACK block */
+ u32 end; /**< End of the traced SACK block */
+ u32 ack; /**< Ack if it equals the block end, 0 otherwise */
+ u32 snd_una_max; /**< snd_una_max if ack equals the block end, else 0 */
+ u32 group; /**< Counter value shared by elts added in one trace call */
+} scoreboard_trace_elt_t;
+
+typedef struct _sack_scoreboard_hole
+{
+ u32 next; /**< Index for next entry in linked list */
+ u32 prev; /**< Index for previous entry in linked list */
+ u32 start; /**< Start sequence number */
+ u32 end; /**< End sequence number */
+ u8 is_lost; /**< Mark hole as lost */
+} sack_scoreboard_hole_t; /**< Element type of sack_scoreboard_t holes pool */
+
+typedef struct _sack_scoreboard
+{
+ sack_scoreboard_hole_t *holes; /**< Pool of holes */
+ u32 head; /**< Index of first entry */
+ u32 tail; /**< Index of last entry, or TCP_INVALID_SACK_HOLE_INDEX */
+ u32 sacked_bytes; /**< Number of bytes sacked in sb */
+ u32 last_sacked_bytes; /**< Number of bytes last sacked */
+ u32 last_bytes_delivered; /**< Sack bytes delivered to app */
+ u32 rxt_sacked; /**< Rxt bytes last delivered */
+ u32 high_sacked; /**< Highest byte sacked (fack) */
+ u32 high_rxt; /**< Highest retransmitted sequence */
+ u32 rescue_rxt; /**< Rescue sequence number */
+ u32 lost_bytes; /**< Bytes lost as per RFC6675 */
+ u32 last_lost_bytes; /**< Number of bytes last lost */
+ u32 cur_rxt_hole; /**< Retransmitting from this hole */
+ u8 is_reneging; /**< Reneging marker; see scoreboard_clear_reneging (TODO confirm) */
+
+#if TCP_SCOREBOARD_TRACE
+ scoreboard_trace_elt_t *trace; /**< Vector appended to by tcp_scoreboard_trace_add */
+#endif
+
+} sack_scoreboard_t;
+
+#define TCP_BTS_INVALID_INDEX ((u32)~0) /**< presumably "no sample" index - confirm in tcp_bt.c */
+
+typedef enum tcp_bts_flags_
+{
+ TCP_BTS_IS_RXT = 1,
+ TCP_BTS_IS_APP_LIMITED = 1 << 1,
+ TCP_BTS_IS_SACKED = 1 << 2,
+ TCP_BTS_IS_RXT_LOST = 1 << 3,
+} __clib_packed tcp_bts_flags_t; /**< Single-bit values held in bt/rate sample flags */
+
+typedef struct tcp_bt_sample_
+{
+ u32 next; /**< Next sample index in list */
+ u32 prev; /**< Previous sample index in list */
+ u32 min_seq; /**< Min seq number in sample */
+ u32 max_seq; /**< Max seq number. Set for rxt samples */
+ u64 delivered; /**< Total delivered bytes for sample */
+ f64 delivered_time; /**< Delivered time when sample taken */
+ f64 tx_time; /**< Transmit time for the burst */
+ f64 first_tx_time; /**< Connection first tx time at tx */
+ u64 tx_in_flight; /**< In flight at tx time */
+ u64 tx_lost; /**< Lost at tx time */
+ tcp_bts_flags_t flags; /**< Sample flags (tcp_bts_flags_t bits) */
+} tcp_bt_sample_t; /**< Element type of tcp_byte_tracker_t samples pool */
+
+typedef struct tcp_rate_sample_
+{
+ u64 prior_delivered; /**< Delivered of sample used for rate, i.e.,
+ total bytes delivered at prior_time */
+ f64 prior_time; /**< Delivered time of sample used for rate */
+ f64 interval_time; /**< Time to ack the bytes delivered */
+ f64 rtt_time; /**< RTT for sample */
+ u64 tx_in_flight; /**< In flight at (re)transmit time */
+ u64 tx_lost; /**< Lost over interval */
+ u32 delivered; /**< Bytes delivered in interval_time */
+ u32 acked_and_sacked; /**< Bytes acked + sacked now */
+ u32 last_lost; /**< Bytes lost now */
+ u32 lost; /**< Number of bytes lost over interval */
+ tcp_bts_flags_t flags; /**< Rate sample flags from bt sample */
+} tcp_rate_sample_t; /**< Handed to the cc rcv_ack/rcv_cong_ack callbacks */
+
+typedef struct tcp_byte_tracker_
+{
+ tcp_bt_sample_t *samples; /**< Pool of samples */
+ rb_tree_t sample_lookup; /**< Rbtree for sample lookup by min_seq */
+ u32 head; /**< Head of samples linked list */
+ u32 tail; /**< Tail of samples linked list */
+ u32 last_ooo; /**< Cached last ooo sample */
+} tcp_byte_tracker_t; /**< Type of the tcp_connection_t bt (tx byte tracker) */
+
+typedef enum _tcp_cc_algorithm_type
+{
+ TCP_CC_NEWRENO,
+ TCP_CC_CUBIC,
+ TCP_CC_LAST = TCP_CC_CUBIC /**< Alias for the highest-valued type */
+} tcp_cc_algorithm_type_e;
+
+typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t;
+
+typedef enum _tcp_cc_ack_t
+{
+ TCP_CC_ACK,
+ TCP_CC_DUPACK,
+ TCP_CC_PARTIALACK
+} tcp_cc_ack_t; /**< Ack kind argument of the cc rcv_cong_ack callback */
+
+typedef enum tcp_cc_event_
+{
+ TCP_CC_EVT_START_TX,
+} tcp_cc_event_t; /**< Event argument of the cc event callback */
+
+/*
+ * Soft connection error counters, as per RFC4898 tcpEStatsStackSoftErrors
+ */
+typedef struct tcp_errors_
+{
+ u32 below_data_wnd; /**< All data in seg is below snd_una */
+ u32 above_data_wnd; /**< Some data in segment is above snd_wnd */
+ u32 below_ack_wnd; /**< Acks for data below snd_una */
+ u32 above_ack_wnd; /**< Acks for data not sent */
+} tcp_errors_t;
+
+typedef struct _tcp_connection
+{
+ CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ transport_connection_t connection; /**< Common transport data. First! */
+
+ u8 state; /**< TCP state as per tcp_state_t */
+ u8 cfg_flags; /**< Connection configuration flags (see tcp_cfg_flags_e) */
+ u16 flags; /**< Connection flags (see tcp_connection_flags_e) */
+ u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */
+
+ u64 segs_in; /**< RFC4022/4898 tcpHCInSegs/tcpEStatsPerfSegsIn */
+ u64 bytes_in; /**< RFC4898 tcpEStatsPerfHCDataOctetsIn */
+ u64 segs_out; /**< RFC4898 tcpEStatsPerfSegsOut */
+ u64 bytes_out; /**< RFC4898 tcpEStatsPerfHCDataOctetsOut */
+
+ /* Send sequence variables RFC793 */
+ u32 snd_una; /**< oldest unacknowledged sequence number */
+ u32 snd_una_max; /**< newest unacknowledged sequence number + 1 */
+ u32 snd_wnd; /**< send window */
+ u32 snd_wl1; /**< seq number used for last snd.wnd update */
+ u32 snd_wl2; /**< ack number used for last snd.wnd update */
+ u32 snd_nxt; /**< next seq number to be sent */
+ u16 snd_mss; /**< Effective send max seg (data) size */
+
+ u64 data_segs_in; /**< RFC4898 tcpEStatsPerfDataSegsIn */
+ u64 data_segs_out; /**< RFC4898 tcpEStatsPerfDataSegsOut */
+
+ /* Receive sequence variables RFC793 */
+ u32 rcv_nxt; /**< next sequence number expected */
+ u32 rcv_wnd; /**< receive window we expect */
+
+ u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */
+ u32 iss; /**< initial sent sequence */
+ u32 irs; /**< initial remote sequence */
+
+ /* Options */
+ u8 snd_opts_len; /**< Tx options len */
+ u8 rcv_wscale; /**< Window scale to advertise to peer */
+ u8 snd_wscale; /**< Window scale to use when sending */
+ u32 tsval_recent; /**< Last timestamp received */
+ u32 tsval_recent_age; /**< When tsval_recent was last updated */
+ tcp_options_t snd_opts; /**< Tx options for connection */
+ tcp_options_t rcv_opts; /**< Rx options for connection */
+
+ sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */
+ u8 snd_sack_pos; /**< Position in vec of first block to send */
+ sack_block_t *snd_sacks_fl; /**< Vector for building new list */
+ sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */
+
+ u16 rcv_dupacks; /**< Number of recent DUPACKs received */
+ u32 dupacks_in; /**< RFC4898 tcpEStatsStackDupAcksIn */
+ u8 pending_dupacks; /**< Number of DUPACKs to be sent */
+ u32 dupacks_out; /**< RFC4898 tcpEStatsPathDupAcksOut */
+
+ /* Congestion control */
+ u32 cwnd; /**< Congestion window */
+ u32 cwnd_acc_bytes; /**< Bytes accumulated for cwnd increment */
+ u32 ssthresh; /**< Slow-start threshold */
+ u32 prev_ssthresh; /**< ssthresh before congestion */
+ u32 prev_cwnd; /**< cwnd before congestion */
+ u32 bytes_acked; /**< Bytes acknowledged by current segment */
+ u32 burst_acked; /**< Bytes acknowledged in current burst */
+ u32 snd_rxt_bytes; /**< Retransmitted bytes during current cc event */
+ u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */
+ u32 prr_delivered; /**< RFC6937 bytes delivered during current event */
+ u32 prr_start; /**< snd_una when prr starts */
+ u32 rxt_delivered; /**< Rxt bytes delivered during current cc event */
+ u32 rxt_head; /**< snd_una last time we re rxted the head */
+ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */
+ u32 snd_congestion; /**< snd_una_max when congestion is detected */
+ u32 tx_fifo_size; /**< Tx fifo size. Used to constrain cwnd */
+ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */
+ u8 cc_data[TCP_CC_DATA_SZ]; /**< Congestion control algo private data */
+
+ u32 fr_occurences; /**< fast-retransmit occurrences RFC4898
+ tcpEStatsStackFastRetran */
+ u32 tr_occurences; /**< timer-retransmit occurrences */
+ u64 bytes_retrans; /**< RFC4898 tcpEStatsPerfOctetsRetrans */
+ u64 segs_retrans; /**< RFC4898 tcpEStatsPerfSegsRetrans */
+
+ /* RTT and RTO */
+ u32 rto; /**< Retransmission timeout */
+ u32 rto_boff; /**< Index for RTO backoff */
+ u32 srtt; /**< Smoothed RTT */
+ u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */
+ u32 rtt_seq; /**< Sequence number for tracked ACK */
+ f64 rtt_ts; /**< Timestamp for tracked ACK */
+ f64 mrtt_us; /**< High precision mrtt from tracked acks */
+
+ u32 psh_seq; /**< Add psh header for seg that includes this */
+ u32 next_node_index; /**< Can be used to control next node in output */
+ u32 next_node_opaque; /**< Opaque to pass to next node */
+ u32 limited_transmit; /**< snd_nxt when limited transmit starts */
+ u32 sw_if_index; /**< Interface for the connection */
+
+ /* Delivery rate estimation */
+ u64 delivered; /**< Total bytes delivered to peer */
+ u64 app_limited; /**< Delivered when app-limited detected */
+ f64 delivered_time; /**< Time last bytes were acked */
+ f64 first_tx_time; /**< Send time for recently delivered/sent */
+ u64 lost; /**< Total bytes lost */
+ tcp_byte_tracker_t *bt; /**< Tx byte tracker */
+
+ tcp_errors_t errors; /**< Soft connection errors */
+
+ f64 start_ts; /**< Timestamp when connection initialized */
+ u32 last_fib_check; /**< Last time we checked fib route for peer */
+ u16 mss; /**< Our max seg size that includes options */
+ u32 timestamp_delta; /**< Offset for timestamp */
+ u32 ipv6_flow_label; /**< flow label for ipv6 header */
+
+#define rst_state snd_wl1 /* rst_state is an alias that reuses the snd_wl1 field */
+} tcp_connection_t;
+
+/* *INDENT-OFF* */
+struct _tcp_cc_algorithm /* cc algorithm descriptor: a name plus callbacks */
+{
+ const char *name; /**< Algorithm name string */
+ uword (*unformat_cfg) (unformat_input_t * input); /**< Takes an unformat input; presumably parses algo config - confirm */
+ void (*init) (tcp_connection_t * tc);
+ void (*cleanup) (tcp_connection_t * tc);
+ void (*rcv_ack) (tcp_connection_t * tc, tcp_rate_sample_t *rs);
+ void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack,
+ tcp_rate_sample_t *rs);
+ void (*congestion) (tcp_connection_t * tc);
+ void (*loss) (tcp_connection_t * tc);
+ void (*recovered) (tcp_connection_t * tc);
+ void (*undo_recovery) (tcp_connection_t * tc);
+ void (*event) (tcp_connection_t *tc, tcp_cc_event_t evt); /**< evt is a tcp_cc_event_t value */
+ u64 (*get_pacing_rate) (tcp_connection_t *tc); /**< Returns a u64 rate; units not visible here */
+};
+/* *INDENT-ON* */
+
+#define tcp_fastrecovery_on(tc) ((tc)->flags |= TCP_CONN_FAST_RECOVERY)
+#define tcp_fastrecovery_off(tc) ((tc)->flags &= ~TCP_CONN_FAST_RECOVERY)
+#define tcp_recovery_on(tc) ((tc)->flags |= TCP_CONN_RECOVERY)
+#define tcp_recovery_off(tc) ((tc)->flags &= ~TCP_CONN_RECOVERY)
+#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY)
+#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY))
+#define tcp_in_slowstart(tc) ((tc)->cwnd < (tc)->ssthresh)
+#define tcp_disconnect_pending(tc) ((tc)->flags & TCP_CONN_DCNT_PENDING)
+#define tcp_disconnect_pending_on(tc) ((tc)->flags |= TCP_CONN_DCNT_PENDING)
+#define tcp_disconnect_pending_off(tc) ((tc)->flags &= ~TCP_CONN_DCNT_PENDING)
+#define tcp_fastrecovery_first(tc) ((tc)->flags & TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_on(tc) ((tc)->flags |= TCP_CONN_FRXT_FIRST)
+#define tcp_fastrecovery_first_off(tc) ((tc)->flags &= ~TCP_CONN_FRXT_FIRST)
+/* Non-zero while either the fast recovery or the recovery flag is set */
+#define tcp_in_cong_recovery(tc) ((tc)->flags & \
+  (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY))
+
+always_inline void
+tcp_cong_recovery_off (tcp_connection_t * tc)
+{
+  u16 off = TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY | TCP_CONN_FRXT_FIRST;
+  tc->flags &= ~off; /* clear both recovery flags and the rxt-first flag */
+}
+
+#define tcp_csum_offload(tc) (!((tc)->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD))
+
+#define tcp_zero_rwnd_sent(tc) ((tc)->flags & TCP_CONN_ZERO_RWND_SENT)
+#define tcp_zero_rwnd_sent_on(tc) ((tc)->flags |= TCP_CONN_ZERO_RWND_SENT)
+#define tcp_zero_rwnd_sent_off(tc) ((tc)->flags &= ~TCP_CONN_ZERO_RWND_SENT)
+
+always_inline tcp_connection_t *
+tcp_get_connection_from_transport (transport_connection_t * tconn)
+{
+ return (tcp_connection_t *) tconn; /* relies on `connection` being first in tcp_connection_t */
+}
+
+#endif /* SRC_VNET_TCP_TCP_TYPES_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */