tap gso: experimental support

This commit adds a "gso" parameter to existing "create tap..." CLI,
and a "no-gso" parameter for the compatibility with the future,
when/if defaults change.

It makes use of the lowest bit of the "tap_flags" field in the API call
in order to allow creation of GSO interfaces via API as well.

It does the necessary syscalls to enable the GSO
and checksum offload support on the kernel side and sets two flags
on the interface: virtio-specific virtio_if_t.gso_enabled,
and vnet_hw_interface_t.flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO.

The first one, if enabled, triggers the marking of the GSO-encapsulated
packets on ingress with VNET_BUFFER_F_GSO flag, and
setting vnet_buffer2(b)->gso_size to the desired L4 payload size.

VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO determines the egress packet
processing in interface-output for such packets:

When the flag is set, they are sent out almost as usual (just taking
care to set the vnet header for virtio).

When the flag is not enabled (the case for most interfaces),
the egress path performs the re-segmentation such that
the L4 payload of the transmitted packets equals gso_size.

The operations in the datapath are enabled only when there is at least
one GSO-compatible interface in the system - this is done by tracking
the count in interface_main.gso_interface_count. This way the impact
of conditional checks for the setups that do not use GSO is minimized.

"show tap" CLI shows the state of the GSO flag on the interface, and
the total count of GSO-enabled interfaces (which is used to enable
the GSO-related processing in the packet path).

This commit lacks IPv6 extension header traversal support of any kind -
the L4 payload is assumed to follow the IPv6 header. Also it performs
the offloads only for TCP (TSO - TCP segmentation offload).
The UDP fragmentation offload (UFO) is not part of it.

For debug purposes it also adds the debug CLI:

 "set tap gso {<interface> | sw_if_index <sw_idx>} <enable|disable>"

Change-Id: Ifd562db89adcc2208094b3d1032cee8c307aaef9
Signed-off-by: Andrew Yourtchenko <ayourtch@gmail.com>
diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c
index beeb62a..251ff34 100644
--- a/src/vnet/interface_output.c
+++ b/src/vnet/interface_output.c
@@ -47,7 +47,10 @@
 typedef struct
 {
   u32 sw_if_index;
-  u8 data[128 - sizeof (u32)];
+  u32 flags;
+  u16 gso_size;
+  u8 gso_l4_hdr_sz;
+  u8 data[128 - 3 * sizeof (u32)];
 }
 interface_output_trace_t;
 
@@ -69,24 +72,30 @@
 	  (vnm->interface_main.sw_interfaces, t->sw_if_index))
 	{
 	  /* the interface may have been deleted by the time the trace is printed */
-	  s = format (s, "sw_if_index: %d\n%U%U",
-		      t->sw_if_index,
-		      format_white_space, indent,
-		      node->format_buffer ? node->
-		      format_buffer : format_hex_bytes, t->data,
-		      sizeof (t->data));
+	  s = format (s, "sw_if_index: %d ", t->sw_if_index);
 	}
       else
 	{
 	  si = vnet_get_sw_interface (vnm, t->sw_if_index);
-
-	  s = format (s, "%U\n%U%U",
-		      format_vnet_sw_interface_name, vnm, si,
-		      format_white_space, indent,
-		      node->format_buffer ? node->
-		      format_buffer : format_hex_bytes, t->data,
-		      sizeof (t->data));
+	  s =
+	    format (s, "%U ", format_vnet_sw_interface_name, vnm, si,
+		    t->flags);
 	}
+#define _(bit, name, v, x) \
+          if (v && (t->flags & VNET_BUFFER_F_##name)) \
+            s = format (s, "%s ", v);
+      foreach_vnet_buffer_flag
+#undef _
+	if (t->flags & VNET_BUFFER_F_GSO)
+	{
+	  s = format (s, "\n%Ugso_sz %d gso_l4_hdr_sz %d",
+		      format_white_space, indent + 2, t->gso_size,
+		      t->gso_l4_hdr_sz);
+	}
+      s =
+	format (s, "\n%U%U", format_white_space, indent,
+		node->format_buffer ? node->format_buffer : format_hex_bytes,
+		t->data, sizeof (t->data));
     }
   return s;
 }
@@ -121,6 +130,9 @@
 	{
 	  t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
 	  t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+	  t0->flags = b0->flags;
+	  t0->gso_size = vnet_buffer2 (b0)->gso_size;
+	  t0->gso_l4_hdr_sz = vnet_buffer2 (b0)->gso_l4_hdr_sz;
 	  clib_memcpy_fast (t0->data, vlib_buffer_get_current (b0),
 			    sizeof (t0->data));
 	}
@@ -128,6 +140,9 @@
 	{
 	  t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
 	  t1->sw_if_index = vnet_buffer (b1)->sw_if_index[VLIB_TX];
+	  t1->flags = b1->flags;
+	  t1->gso_size = vnet_buffer2 (b1)->gso_size;
+	  t1->gso_l4_hdr_sz = vnet_buffer2 (b1)->gso_l4_hdr_sz;
 	  clib_memcpy_fast (t1->data, vlib_buffer_get_current (b1),
 			    sizeof (t1->data));
 	}
@@ -149,6 +164,9 @@
 	{
 	  t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
 	  t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+	  t0->flags = b0->flags;
+	  t0->gso_size = vnet_buffer2 (b0)->gso_size;
+	  t0->gso_l4_hdr_sz = vnet_buffer2 (b0)->gso_l4_hdr_sz;
 	  clib_memcpy_fast (t0->data, vlib_buffer_get_current (b0),
 			    sizeof (t0->data));
 	}
@@ -192,9 +210,17 @@
     {
       int bogus;
       if (b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM)
-	th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+	{
+	  th->checksum = 0;
+	  th->checksum =
+	    ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+	}
       if (b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)
-	uh->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+	{
+	  uh->checksum = 0;
+	  uh->checksum =
+	    ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+	}
     }
 
   b->flags &= ~VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
@@ -202,12 +228,245 @@
   b->flags &= ~VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
 }
 
+static_always_inline u16
+tso_alloc_tx_bufs (vlib_main_t * vm,
+		   vnet_interface_per_thread_data_t * ptd,
+		   vlib_buffer_t * b0, u16 l4_hdr_sz)
+{
+  u32 n_bytes_b0 = vlib_buffer_length_in_chain (vm, b0);
+  u16 gso_size = vnet_buffer2 (b0)->gso_size;
+  u16 l234_sz = vnet_buffer (b0)->l4_hdr_offset + l4_hdr_sz;
+  /* rounded-up division */
+  u16 n_bufs = (n_bytes_b0 - l234_sz + (gso_size - 1)) / gso_size;
+  u16 n_alloc;
+
+  ASSERT (n_bufs > 0);
+  vec_validate (ptd->split_buffers, n_bufs - 1);
+
+  n_alloc = vlib_buffer_alloc (vm, ptd->split_buffers, n_bufs);
+  if (n_alloc < n_bufs)
+    {
+      vlib_buffer_free (vm, ptd->split_buffers, n_alloc);
+      return 0;
+    }
+  return 1;
+}
+
+static_always_inline void
+tso_init_buf_from_template_base (vlib_buffer_t * nb0, vlib_buffer_t * b0,
+				 u32 flags, u16 length)
+{
+  nb0->current_data = 0;
+  nb0->total_length_not_including_first_buffer = 0;
+  nb0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID | flags;
+  clib_memcpy_fast (&nb0->opaque, &b0->opaque, sizeof (nb0->opaque));
+  clib_memcpy_fast (nb0->data, b0->data, length);
+  nb0->current_length = length;
+}
+
+static_always_inline void
+tso_init_buf_from_template (vlib_main_t * vm, vlib_buffer_t * nb0,
+			    vlib_buffer_t * b0, u16 template_data_sz,
+			    u16 gso_size, u8 ** p_dst_ptr, u16 * p_dst_left,
+			    u32 next_tcp_seq, u32 flags)
+{
+  tso_init_buf_from_template_base (nb0, b0, flags, template_data_sz);
+
+  *p_dst_left =
+    clib_min (gso_size,
+	      vlib_buffer_get_default_data_size (vm) - template_data_sz);
+  *p_dst_ptr = nb0->data + template_data_sz;
+
+  tcp_header_t *tcp =
+    (tcp_header_t *) (nb0->data + vnet_buffer (nb0)->l4_hdr_offset);
+  tcp->seq_number = clib_host_to_net_u32 (next_tcp_seq);
+}
+
+static_always_inline void
+tso_fixup_segmented_buf (vlib_buffer_t * b0, u8 tcp_flags, int is_ip6)
+{
+  u16 l3_hdr_offset = vnet_buffer (b0)->l3_hdr_offset;
+  u16 l4_hdr_offset = vnet_buffer (b0)->l4_hdr_offset;
+  ip4_header_t *ip4 = (ip4_header_t *) (b0->data + l3_hdr_offset);
+  ip6_header_t *ip6 = (ip6_header_t *) (b0->data + l3_hdr_offset);
+  tcp_header_t *tcp = (tcp_header_t *) (b0->data + l4_hdr_offset);
+
+  tcp->flags = tcp_flags;
+
+  if (is_ip6)
+    ip6->payload_length =
+      clib_host_to_net_u16 (b0->current_length -
+			    vnet_buffer (b0)->l4_hdr_offset);
+  else
+    ip4->length =
+      clib_host_to_net_u16 (b0->current_length -
+			    vnet_buffer (b0)->l3_hdr_offset);
+}
+
+/**
+ * Allocate the necessary number of ptd->split_buffers,
+ * and segment the possibly chained buffer(s) from b0 into
+ * there.
+ *
+ * Return the cumulative number of bytes sent or zero
+ * if allocation failed.
+ */
+
+static_always_inline u32
+tso_segment_buffer (vlib_main_t * vm, vnet_interface_per_thread_data_t * ptd,
+		    int do_tx_offloads, u32 sbi0, vlib_buffer_t * sb0,
+		    u32 n_bytes_b0)
+{
+  u32 n_tx_bytes = 0;
+  int is_ip4 = sb0->flags & VNET_BUFFER_F_IS_IP4;
+  int is_ip6 = sb0->flags & VNET_BUFFER_F_IS_IP6;
+  ASSERT (is_ip4 || is_ip6);
+  ASSERT (sb0->flags & VNET_BUFFER_F_L2_HDR_OFFSET_VALID);
+  ASSERT (sb0->flags & VNET_BUFFER_F_L3_HDR_OFFSET_VALID);
+  ASSERT (sb0->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+  u16 gso_size = vnet_buffer2 (sb0)->gso_size;
+
+  int l4_hdr_sz = vnet_buffer2 (sb0)->gso_l4_hdr_sz;
+  u8 save_tcp_flags = 0;
+  u8 tcp_flags_no_fin_psh = 0;
+  u32 next_tcp_seq = 0;
+
+  tcp_header_t *tcp =
+    (tcp_header_t *) (sb0->data + vnet_buffer (sb0)->l4_hdr_offset);
+  next_tcp_seq = clib_net_to_host_u32 (tcp->seq_number);
+  /* store original flags for last packet and reset FIN and PSH */
+  save_tcp_flags = tcp->flags;
+  tcp_flags_no_fin_psh = tcp->flags & ~(TCP_FLAG_FIN | TCP_FLAG_PSH);
+  tcp->checksum = 0;
+
+  u32 default_bflags =
+    sb0->flags & ~(VNET_BUFFER_F_GSO | VLIB_BUFFER_NEXT_PRESENT);
+  u16 l234_sz = vnet_buffer (sb0)->l4_hdr_offset + l4_hdr_sz;
+  int first_data_size = clib_min (gso_size, sb0->current_length - l234_sz);
+  next_tcp_seq += first_data_size;
+
+  if (PREDICT_FALSE (!tso_alloc_tx_bufs (vm, ptd, sb0, l4_hdr_sz)))
+    return 0;
+
+  vlib_buffer_t *b0 = vlib_get_buffer (vm, ptd->split_buffers[0]);
+  tso_init_buf_from_template_base (b0, sb0, default_bflags,
+				   l4_hdr_sz + first_data_size);
+
+  u32 total_src_left = n_bytes_b0 - l234_sz - first_data_size;
+  if (total_src_left)
+    {
+      /* Need to copy more segments */
+      u8 *src_ptr, *dst_ptr;
+      u16 src_left, dst_left;
+      /* current source buffer */
+      vlib_buffer_t *csb0 = sb0;
+      u32 csbi0 = sbi0;
+      /* current dest buffer */
+      vlib_buffer_t *cdb0;
+      u16 dbi = 1;		/* the buffer [0] is b0 */
+
+      src_ptr = sb0->data + l234_sz + first_data_size;
+      src_left = sb0->current_length - l234_sz - first_data_size;
+      b0->current_length = l234_sz + first_data_size;
+
+      tso_fixup_segmented_buf (b0, tcp_flags_no_fin_psh, is_ip6);
+      if (do_tx_offloads)
+	calc_checksums (vm, b0);
+
+      /* grab a second buffer and prepare the loop */
+      ASSERT (dbi < vec_len (ptd->split_buffers));
+      cdb0 = vlib_get_buffer (vm, ptd->split_buffers[dbi++]);
+      tso_init_buf_from_template (vm, cdb0, b0, l234_sz, gso_size, &dst_ptr,
+				  &dst_left, next_tcp_seq, default_bflags);
+
+      /* an arbitrary large number to catch the runaway loops */
+      int nloops = 2000;
+      while (total_src_left)
+	{
+	  ASSERT (nloops-- > 0);
+	  u16 bytes_to_copy = clib_min (src_left, dst_left);
+
+	  clib_memcpy_fast (dst_ptr, src_ptr, bytes_to_copy);
+
+	  src_left -= bytes_to_copy;
+	  src_ptr += bytes_to_copy;
+	  total_src_left -= bytes_to_copy;
+	  dst_left -= bytes_to_copy;
+	  dst_ptr += bytes_to_copy;
+	  next_tcp_seq += bytes_to_copy;
+	  cdb0->current_length += bytes_to_copy;
+
+	  if (0 == src_left)
+	    {
+	      int has_next = (csb0->flags & VLIB_BUFFER_NEXT_PRESENT);
+	      u32 next_bi = csb0->next_buffer;
+
+	      /* init src to the next buffer in chain */
+	      if (has_next)
+		{
+		  csbi0 = next_bi;
+		  csb0 = vlib_get_buffer (vm, csbi0);
+		  src_left = csb0->current_length;
+		  src_ptr = csb0->data;
+		}
+	      else
+		{
+		  ASSERT (total_src_left == 0);
+		  break;
+		}
+	    }
+	  if (0 == dst_left && total_src_left)
+	    {
+	      if (do_tx_offloads)
+		calc_checksums (vm, cdb0);
+	      n_tx_bytes += cdb0->current_length;
+	      ASSERT (dbi < vec_len (ptd->split_buffers));
+	      cdb0 = vlib_get_buffer (vm, ptd->split_buffers[dbi++]);
+	      tso_init_buf_from_template (vm, cdb0, b0, l234_sz,
+					  gso_size, &dst_ptr, &dst_left,
+					  next_tcp_seq, default_bflags);
+	    }
+	}
+
+      tso_fixup_segmented_buf (cdb0, save_tcp_flags, is_ip6);
+      if (do_tx_offloads)
+	calc_checksums (vm, cdb0);
+
+      n_tx_bytes += cdb0->current_length;
+    }
+  n_tx_bytes += b0->current_length;
+  return n_tx_bytes;
+}
+
+static_always_inline void
+drop_one_buffer_and_count (vlib_main_t * vm, vnet_main_t * vnm,
+			   vlib_node_runtime_t * node, u32 * pbi0,
+			   u32 drop_error_code)
+{
+  u32 thread_index = vm->thread_index;
+  vnet_interface_output_runtime_t *rt = (void *) node->runtime_data;
+
+  vlib_simple_counter_main_t *cm;
+  cm =
+    vec_elt_at_index (vnm->interface_main.sw_if_counters,
+		      VNET_INTERFACE_COUNTER_TX_ERROR);
+  vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, 1);
+
+  vlib_error_drop_buffers (vm, node, pbi0,
+			   /* buffer stride */ 1,
+			   /* n_buffers */ 1,
+			   VNET_INTERFACE_OUTPUT_NEXT_DROP,
+			   node->node_index, drop_error_code);
+}
+
 static_always_inline uword
-vnet_interface_output_node_inline (vlib_main_t * vm,
-				   vlib_node_runtime_t * node,
-				   vlib_frame_t * frame, vnet_main_t * vnm,
-				   vnet_hw_interface_t * hi,
-				   int do_tx_offloads)
+vnet_interface_output_node_inline_gso (vlib_main_t * vm,
+				       vlib_node_runtime_t * node,
+				       vlib_frame_t * frame,
+				       vnet_main_t * vnm,
+				       vnet_hw_interface_t * hi,
+				       int do_tx_offloads,
+				       int do_segmentation)
 {
   vnet_interface_output_runtime_t *rt = (void *) node->runtime_data;
   vnet_sw_interface_t *si;
@@ -219,6 +478,8 @@
   u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX;
   u32 current_config_index = ~0;
   u8 arc = im->output_feature_arc_index;
+  vnet_interface_per_thread_data_t *ptd =
+    vec_elt_at_index (im->per_thread_data, thread_index);
 
   n_buffers = frame->n_vectors;
 
@@ -300,15 +561,30 @@
 	  to_tx[1] = bi1;
 	  to_tx[2] = bi2;
 	  to_tx[3] = bi3;
-	  from += 4;
-	  to_tx += 4;
-	  n_left_to_tx -= 4;
+	  if (!do_segmentation)
+	    {
+	      from += 4;
+	      to_tx += 4;
+	      n_left_to_tx -= 4;
+	    }
 
 	  b0 = vlib_get_buffer (vm, bi0);
 	  b1 = vlib_get_buffer (vm, bi1);
 	  b2 = vlib_get_buffer (vm, bi2);
 	  b3 = vlib_get_buffer (vm, bi3);
 
+	  if (do_segmentation)
+	    {
+	      or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+
+	      /* go to single loop if we need TSO segmentation */
+	      if (PREDICT_FALSE (or_flags & VNET_BUFFER_F_GSO))
+		break;
+	      from += 4;
+	      to_tx += 4;
+	      n_left_to_tx -= 4;
+	    }
+
 	  /* Be grumpy about zero length buffers for benefit of
 	     driver tx function. */
 	  ASSERT (b0->current_length > 0);
@@ -376,7 +652,8 @@
 					       n_bytes_b3);
 	    }
 
-	  or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+	  if (!do_segmentation)
+	    or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
 
 	  if (do_tx_offloads)
 	    {
@@ -422,6 +699,85 @@
 	      b0->current_config_index = current_config_index;
 	    }
 
+	  if (do_segmentation)
+	    {
+	      if (PREDICT_FALSE (b0->flags & VNET_BUFFER_F_GSO))
+		{
+		  /*
+		   * Undo the enqueue of the b0 - it is not going anywhere,
+		   * and will be freed either after it's segmented or
+		   * when dropped, if there is no buffers to segment into.
+		   */
+		  to_tx -= 1;
+		  n_left_to_tx += 1;
+		  /* undo the counting. */
+		  n_bytes -= n_bytes_b0;
+		  n_packets -= 1;
+
+		  u32 n_tx_bytes = 0;
+
+		  n_tx_bytes =
+		    tso_segment_buffer (vm, ptd, do_tx_offloads, bi0, b0,
+					n_bytes_b0);
+
+		  if (PREDICT_FALSE (n_tx_bytes == 0))
+		    {
+		      drop_one_buffer_and_count (vm, vnm, node, from - 1,
+						 VNET_INTERFACE_OUTPUT_ERROR_NO_BUFFERS_FOR_GSO);
+		      continue;
+		    }
+
+		  u16 n_tx_bufs = vec_len (ptd->split_buffers);
+		  u32 *from_tx_seg = ptd->split_buffers;
+
+		  while (n_tx_bufs > 0)
+		    {
+		      if (n_tx_bufs >= n_left_to_tx)
+			{
+			  while (n_left_to_tx > 0)
+			    {
+			      to_tx[0] = from_tx_seg[0];
+			      to_tx += 1;
+			      from_tx_seg += 1;
+			      n_left_to_tx -= 1;
+			      n_tx_bufs -= 1;
+			      n_packets += 1;
+			    }
+			  vlib_put_next_frame (vm, node, next_index,
+					       n_left_to_tx);
+			  vlib_get_new_next_frame (vm, node, next_index,
+						   to_tx, n_left_to_tx);
+			}
+		      else
+			{
+			  while (n_tx_bufs > 0)
+			    {
+			      to_tx[0] = from_tx_seg[0];
+			      to_tx += 1;
+			      from_tx_seg += 1;
+			      n_left_to_tx -= 1;
+			      n_tx_bufs -= 1;
+			      n_packets += 1;
+			    }
+			}
+		    }
+		  n_bytes += n_tx_bytes;
+		  if (PREDICT_FALSE (tx_swif0 != rt->sw_if_index))
+		    {
+
+		      vlib_increment_combined_counter
+			(im->combined_sw_if_counters +
+			 VNET_INTERFACE_COUNTER_TX, thread_index, tx_swif0,
+			 _vec_len (ptd->split_buffers), n_tx_bytes);
+		    }
+		  /* The buffers were enqueued. Reset the length */
+		  _vec_len (ptd->split_buffers) = 0;
+		  /* Free the now segmented buffer */
+		  vlib_buffer_free_one (vm, bi0);
+		  continue;
+		}
+	    }
+
 	  if (PREDICT_FALSE (tx_swif0 != rt->sw_if_index))
 	    {
 
@@ -446,6 +802,33 @@
   return n_buffers;
 }
 
+static_always_inline uword
+vnet_interface_output_node_inline (vlib_main_t * vm,
+				   vlib_node_runtime_t * node,
+				   vlib_frame_t * frame, vnet_main_t * vnm,
+				   vnet_hw_interface_t * hi,
+				   int do_tx_offloads)
+{
+  /*
+   * The 3-headed "if" is here because we want to err on the side
+   * of not impacting the non-GSO performance - so for the more
+   * common case of no GSO interfaces we want to prevent the
+   * segmentation codepath from being there altogether.
+   */
+  if (PREDICT_TRUE (vnm->interface_main.gso_interface_count == 0))
+    return vnet_interface_output_node_inline_gso (vm, node, frame, vnm, hi,
+						  do_tx_offloads,
+						  /* do_segmentation */ 0);
+  else if (hi->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
+    return vnet_interface_output_node_inline_gso (vm, node, frame, vnm, hi,
+						  do_tx_offloads,
+						  /* do_segmentation */ 0);
+  else
+    return vnet_interface_output_node_inline_gso (vm, node, frame, vnm, hi,
+						  do_tx_offloads,
+						  /* do_segmentation */ 1);
+}
+
 uword
 vnet_interface_output_node (vlib_main_t * vm, vlib_node_runtime_t * node,
 			    vlib_frame_t * frame)