tap gso: experimental support

This commit adds a "gso" parameter to the existing "create tap..." CLI,
and a "no-gso" parameter for forward compatibility, in case the default
changes in the future.

It makes use of the lowest bit of the "tap_flags" field in the API call
in order to allow creation of GSO interfaces via API as well.

It issues the necessary ioctls to enable GSO and checksum offload
support on the kernel side, and sets two flags on the interface:
the virtio-specific virtio_if_t.gso_enabled, and
VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO in vnet_hw_interface_t.flags.

The first one, when set, causes GSO packets received on ingress to be
marked with the VNET_BUFFER_F_GSO flag, with vnet_buffer2(b)->gso_size
set to the desired L4 payload size.

VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO determines the egress packet
processing in interface-output for such packets:

When the flag is set, they are sent out almost as usual (just taking
care to set the vnet header for virtio).

When the flag is not enabled (the case for most interfaces),
the egress path performs the re-segmentation such that
the L4 payload of the transmitted packets equals gso_size.

The operations in the datapath are enabled only when there is at least
one GSO-compatible interface in the system - this is done by tracking
the count in interface_main.gso_interface_count. This way the impact
of conditional checks for the setups that do not use GSO is minimized.

"show tap" CLI shows the state of the GSO flag on the interface, and
the total count of GSO-enabled interfaces (which is used to enable
the GSO-related processing in the packet path).

This commit lacks IPv6 extension header traversal support of any kind -
the L4 header is assumed to immediately follow the IPv6 header. Also,
offloads are performed only for TCP (TSO - TCP segmentation offload);
UDP fragmentation offload (UFO) is not part of this commit.

For debug purposes it also adds the debug CLI:

 "set tap gso {<interface> | sw_if_index <sw_idx>} <enable|disable>"

Change-Id: Ifd562db89adcc2208094b3d1032cee8c307aaef9
Signed-off-by: Andrew Yourtchenko <ayourtch@gmail.com>
diff --git a/src/vnet/devices/tap/cli.c b/src/vnet/devices/tap/cli.c
index ee57a72..084fb90 100644
--- a/src/vnet/devices/tap/cli.c
+++ b/src/vnet/devices/tap/cli.c
@@ -39,6 +39,7 @@
   int ip_addr_set = 0;
 
   args.id = ~0;
+  args.tap_flags = 0;
 
   /* Get a line of input. */
   if (unformat_user (input, unformat_line_input, line_input))
@@ -75,6 +76,10 @@
 	    ;
 	  else if (unformat (line_input, "tx-ring-size %d", &args.tx_ring_sz))
 	    ;
+	  else if (unformat (line_input, "no-gso"))
+	    args.tap_flags &= ~TAP_FLAG_GSO;
+	  else if (unformat (line_input, "gso"))
+	    args.tap_flags |= TAP_FLAG_GSO;
 	  else if (unformat (line_input, "hw-addr %U",
 			     unformat_ethernet_address, args.mac_addr))
 	    args.mac_addr_set = 1;
@@ -109,7 +114,7 @@
     "[rx-ring-size <size>] [tx-ring-size <size>] [host-ns <netns>] "
     "[host-bridge <bridge-name>] [host-ip4-addr <ip4addr/mask>] "
     "[host-ip6-addr <ip6-addr>] [host-ip4-gw <ip4-addr>] "
-    "[host-ip6-gw <ip6-addr>] [host-if-name <name>]",
+    "[host-ip6-gw <ip6-addr>] [host-if-name <name>] [no-gso|gso]",
   .function = tap_create_command_fn,
 };
 /* *INDENT-ON* */
@@ -163,6 +168,74 @@
 /* *INDENT-ON* */
 
+/*
+ * CLI handler for "set tap gso": parses an interface (by name or by
+ * sw_if_index) and an optional enable/disable keyword (the default is
+ * enable), then applies the setting via tap_gso_enable_disable ().
+ * Returns 0 on success or a clib error describing the failure.
+ */
 static clib_error_t *
+tap_gso_command_fn (vlib_main_t * vm, unformat_input_t * input,
+		    vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_main_t *vnm = vnet_get_main ();
+  clib_error_t *error = 0;
+  int enable = 1;
+  int rv;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "Missing <interface>");
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+	;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+			 vnm, &sw_if_index))
+	;
+      else if (unformat (line_input, "enable"))
+	enable = 1;
+      else if (unformat (line_input, "disable"))
+	enable = 0;
+      else
+	{
+	  /* Report the unparsed remainder of the line being parsed. */
+	  error = clib_error_return (0, "unknown input `%U'",
+				     format_unformat_error, line_input);
+	  break;
+	}
+    }
+  /* Release the line buffer on every path, including parse errors. */
+  unformat_free (line_input);
+
+  if (error)
+    return error;
+
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+			      "please specify interface name or sw_if_index");
+
+  rv = tap_gso_enable_disable (vm, sw_if_index, enable);
+  if (rv == VNET_API_ERROR_INVALID_SW_IF_INDEX)
+    return clib_error_return (0, "not a tap interface");
+  else if (rv != 0)
+    return clib_error_return (0, "error on configuring GSO on tap interface");
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tap_gso__command, static) =
+{
+  .path = "set tap gso",
+  .short_help = "set tap gso {<interface> | sw_if_index <sw_idx>} <enable|disable>",
+  .function = tap_gso_command_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
 tap_show_command_fn (vlib_main_t * vm, unformat_input_t * input,
 		     vlib_cli_command_t * cmd)
 {
diff --git a/src/vnet/devices/tap/tap.c b/src/vnet/devices/tap/tap.c
index 101576c..3739561 100644
--- a/src/vnet/devices/tap/tap.c
+++ b/src/vnet/devices/tap/tap.c
@@ -176,6 +176,16 @@
 
   unsigned int offload = 0;
   hdrsz = sizeof (struct virtio_net_hdr_v1);
+  if (args->tap_flags & TAP_FLAG_GSO)
+    {
+      offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
+      vif->gso_enabled = 1;
+    }
+  else
+    {
+      vif->gso_enabled = 0;
+    }
+
   _IOCTL (vif->tap_fd, TUNSETOFFLOAD, offload);
   _IOCTL (vif->tap_fd, TUNSETVNETHDRSZ, &hdrsz);
   _IOCTL (vif->fd, VHOST_SET_OWNER, 0);
@@ -386,6 +396,11 @@
   args->sw_if_index = vif->sw_if_index;
   hw = vnet_get_hw_interface (vnm, vif->hw_if_index);
   hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+  if (args->tap_flags & TAP_FLAG_GSO)
+    {
+      hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
+      vnm->interface_main.gso_interface_count++;
+    }
   vnet_hw_interface_set_input_node (vnm, vif->hw_if_index,
 				    virtio_input_node.index);
   vnet_hw_interface_assign_rx_thread (vnm, vif->hw_if_index, 0, ~0);
@@ -442,6 +457,10 @@
   if (vif->type != VIRTIO_IF_TYPE_TAP)
     return VNET_API_ERROR_INVALID_INTERFACE;
 
+  /* decrement if this was a GSO interface */
+  if (hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
+    vnm->interface_main.gso_interface_count--;
+
   /* bring down the interface */
   vnet_hw_interface_set_flags (vnm, vif->hw_if_index, 0);
   vnet_sw_interface_set_flags (vnm, vif->sw_if_index, 0);
@@ -467,6 +486,76 @@
 }
 
+/*
+ * Enable or disable GSO on an existing tap interface: programs the
+ * kernel offload flags with TUNSETOFFLOAD, then brings
+ * vif->gso_enabled, VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO and
+ * interface_main.gso_interface_count in line with the request.
+ *
+ * Returns 0 on success, VNET_API_ERROR_INVALID_SW_IF_INDEX when
+ * sw_if_index does not refer to a tap interface, and
+ * VNET_API_ERROR_SYSCALL_ERROR_3 when an error was recorded in err.
+ */
 int
+tap_gso_enable_disable (vlib_main_t * vm, u32 sw_if_index, int enable_disable)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  virtio_main_t *mm = &virtio_main;
+  virtio_if_t *vif;
+  vnet_hw_interface_t *hw;
+  clib_error_t *err = 0;
+
+  /* sw_if_index can come straight from the CLI, so validate it first */
+  if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index))
+    return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+  hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  if (hw == NULL || virtio_device_class.index != hw->dev_class_index)
+    return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+  vif = pool_elt_at_index (mm->interfaces, hw->dev_instance);
+
+  /* A virtio-class interface is not necessarily a tap (tap_delete_if
+     makes the same check); TUNSETOFFLOAD needs a tap fd. */
+  if (vif->type != VIRTIO_IF_TYPE_TAP)
+    return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+  const unsigned int gso_on = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
+  const unsigned int gso_off = 0;
+  unsigned int offload = enable_disable ? gso_on : gso_off;
+  /* NOTE(review): _IOCTL is defined outside this hunk; presumably it
+     sets err and jumps to the error label when the ioctl fails. */
+  _IOCTL (vif->tap_fd, TUNSETOFFLOAD, offload);
+  vif->gso_enabled = enable_disable ? 1 : 0;
+  if (enable_disable)
+    {
+      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) == 0)
+	{
+	  vnm->interface_main.gso_interface_count++;
+	  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
+	}
+    }
+  else
+    {
+      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) != 0)
+	{
+	  vnm->interface_main.gso_interface_count--;
+	  hw->flags &= ~VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
+	}
+    }
+
+error:
+  if (err)
+    {
+      clib_warning ("Error %s gso on sw_if_index %d",
+		    enable_disable ? "enabling" : "disabling", sw_if_index);
+      /* the error object is only used as a failure marker: release it */
+      clib_error_free (err);
+      return VNET_API_ERROR_SYSCALL_ERROR_3;
+    }
+  return 0;
+}
+
+int
 tap_dump_ifs (tap_interface_details_t ** out_tapids)
 {
   vnet_main_t *vnm = vnet_get_main ();
diff --git a/src/vnet/devices/tap/tap.h b/src/vnet/devices/tap/tap.h
index 19dc88d..745f9fc 100644
--- a/src/vnet/devices/tap/tap.h
+++ b/src/vnet/devices/tap/tap.h
@@ -30,6 +30,7 @@
   u16 rx_ring_sz;
   u16 tx_ring_sz;
   u32 tap_flags;
+#define TAP_FLAG_GSO (1 << 0)
   u8 *host_namespace;
   u8 *host_if_name;
   u8 host_mac_addr[6];
@@ -78,6 +79,8 @@
 
 void tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args);
 int tap_delete_if (vlib_main_t * vm, u32 sw_if_index);
+int tap_gso_enable_disable (vlib_main_t * vm, u32 sw_if_index,
+			    int enable_disable);
 int tap_dump_ifs (tap_interface_details_t ** out_tapids);
 
 #endif /* _VNET_DEVICES_VIRTIO_TAP_H_ */
diff --git a/src/vnet/devices/virtio/device.c b/src/vnet/devices/virtio/device.c
index aa6a342..609ffb4 100644
--- a/src/vnet/devices/virtio/device.c
+++ b/src/vnet/devices/virtio/device.c
@@ -117,7 +117,7 @@
 static_always_inline u16
 add_buffer_to_slot (vlib_main_t * vm, virtio_if_t * vif,
 		    virtio_vring_t * vring, u32 bi, u16 avail, u16 next,
-		    u16 mask)
+		    u16 mask, int do_gso)
 {
   u16 n_added = 0;
   int hdr_sz = vif->virtio_net_hdr_sz;
@@ -127,6 +127,25 @@
   struct virtio_net_hdr_v1 *hdr = vlib_buffer_get_current (b) - hdr_sz;
 
   clib_memset (hdr, 0, hdr_sz);
+  if (do_gso && (b->flags & VNET_BUFFER_F_GSO))
+    {
+      if (b->flags & VNET_BUFFER_F_IS_IP4)
+	{
+	  hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+	  hdr->gso_size = vnet_buffer2 (b)->gso_size;
+	  hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+	  hdr->csum_start = vnet_buffer (b)->l4_hdr_offset;	// 0x22;
+	  hdr->csum_offset = 0x10;
+	}
+      else
+	{
+	  hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+	  hdr->gso_size = vnet_buffer2 (b)->gso_size;
+	  hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+	  hdr->csum_start = vnet_buffer (b)->l4_hdr_offset;	// 0x36;
+	  hdr->csum_offset = 0x10;
+	}
+    }
 
   if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0))
     {
@@ -219,7 +238,8 @@
 
 static_always_inline uword
 virtio_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
-			    vlib_frame_t * frame, virtio_if_t * vif)
+			    vlib_frame_t * frame, virtio_if_t * vif,
+			    int do_gso)
 {
   u8 qid = 0;
   u16 n_left = frame->n_vectors;
@@ -246,7 +266,8 @@
     {
       u16 n_added = 0;
       n_added =
-	add_buffer_to_slot (vm, vif, vring, buffers[0], avail, next, mask);
+	add_buffer_to_slot (vm, vif, vring, buffers[0], avail, next, mask,
+			    do_gso);
       if (!n_added)
 	break;
       avail += n_added;
@@ -286,7 +307,12 @@
   vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
   virtio_if_t *vif = pool_elt_at_index (nm->interfaces, rund->dev_instance);
 
-  return virtio_interface_tx_inline (vm, node, frame, vif);
+  vnet_main_t *vnm = vnet_get_main ();
+  if (vnm->interface_main.gso_interface_count > 0)
+    return virtio_interface_tx_inline (vm, node, frame, vif, 1 /* do_gso */ );
+  else
+    return virtio_interface_tx_inline (vm, node, frame, vif,
+				       0 /* no do_gso */ );
 }
 
 static void
diff --git a/src/vnet/devices/virtio/node.c b/src/vnet/devices/virtio/node.c
index 6b82c41..fcc0f8a 100644
--- a/src/vnet/devices/virtio/node.c
+++ b/src/vnet/devices/virtio/node.c
@@ -30,6 +30,7 @@
 #include <vnet/feature/feature.h>
 #include <vnet/ip/ip4_packet.h>
 #include <vnet/ip/ip6_packet.h>
+#include <vnet/udp/udp_packet.h>
 #include <vnet/devices/virtio/virtio.h>
 
 
@@ -140,9 +141,86 @@
   goto more;
 }
 
+/*
+ * Translate the virtio net header of a received packet into vlib buffer
+ * metadata: header offsets, checksum-offload flags and, for TCP GSO
+ * packets, VNET_BUFFER_F_GSO with gso_size and gso_l4_hdr_sz.
+ * Packet headers are read starting at b0->data; only untagged Ethernet
+ * is parsed and IPv6 extension headers are not traversed.
+ */
+static_always_inline void
+fill_gso_buffer_flags (vlib_buffer_t * b0, struct virtio_net_hdr_v1 *hdr)
+{
+  u8 l4_proto = 0;
+  u8 l4_hdr_sz = 0;
+  if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+    {
+      ethernet_header_t *eh = (ethernet_header_t *) b0->data;
+      u16 ethertype = clib_net_to_host_u16 (eh->type);
+      u16 l2hdr_sz = sizeof (ethernet_header_t);
+
+      vnet_buffer (b0)->l2_hdr_offset = 0;
+      vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz;
+      if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4))
+	{
+	  ip4_header_t *ip4 = (ip4_header_t *) (b0->data + l2hdr_sz);
+	  vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + ip4_header_bytes (ip4);
+	  l4_proto = ip4->protocol;
+	  b0->flags |=
+	    (VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_L2_HDR_OFFSET_VALID
+	     | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
+	     VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+	  b0->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
+	}
+      else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6))
+	{
+	  ip6_header_t *ip6 = (ip6_header_t *) (b0->data + l2hdr_sz);
+	  /* FIXME IPv6 EH traversal */
+	  vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + sizeof (ip6_header_t);
+	  l4_proto = ip6->protocol;
+	  b0->flags |=
+	    (VNET_BUFFER_F_IS_IP6 | VNET_BUFFER_F_L2_HDR_OFFSET_VALID
+	     | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
+	     VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+	  /* no OFFLOAD_IP_CKSUM here: IPv6 has no header checksum */
+	}
+      if (l4_proto == IP_PROTOCOL_TCP)
+	{
+	  b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+	  tcp_header_t *tcp = (tcp_header_t *) (b0->data +
+						vnet_buffer
+						(b0)->l4_hdr_offset);
+	  l4_hdr_sz = tcp_header_bytes (tcp);
+	  tcp->checksum = 0;
+	}
+      else if (l4_proto == IP_PROTOCOL_UDP)
+	{
+	  b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
+	  udp_header_t *udp = (udp_header_t *) (b0->data +
+						vnet_buffer
+						(b0)->l4_hdr_offset);
+	  l4_hdr_sz = sizeof (*udp);
+	  udp->checksum = 0;
+	}
+    }
+
+  if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV4
+      || hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV6)
+    {
+      ASSERT (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM);
+      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
+      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
+      b0->flags |= VNET_BUFFER_F_GSO;
+      b0->flags |= (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV4) ?
+	VNET_BUFFER_F_IS_IP4 : VNET_BUFFER_F_IS_IP6;
+    }
+}
+
+
 static_always_inline uword
 virtio_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
-			    vlib_frame_t * frame, virtio_if_t * vif, u16 qid)
+			    vlib_frame_t * frame, virtio_if_t * vif, u16 qid,
+			    int gso_enabled)
 {
   vnet_main_t *vnm = vnet_get_main ();
   u32 thread_index = vm->thread_index;
@@ -187,6 +265,10 @@
 	  b0->current_length = len;
 	  b0->total_length_not_including_first_buffer = 0;
 	  b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+	  if (gso_enabled)
+	    fill_gso_buffer_flags (b0, hdr);
+
 	  vnet_buffer (b0)->sw_if_index[VLIB_RX] = vif->sw_if_index;
 	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
 
@@ -286,8 +368,12 @@
     mif = vec_elt_at_index (nm->interfaces, dq->dev_instance);
     if (mif->flags & VIRTIO_IF_FLAG_ADMIN_UP)
       {
-	n_rx += virtio_device_input_inline (vm, node, frame, mif,
-					    dq->queue_id);
+	if (mif->gso_enabled)
+	  n_rx += virtio_device_input_inline (vm, node, frame, mif,
+					      dq->queue_id, 1);
+	else
+	  n_rx += virtio_device_input_inline (vm, node, frame, mif,
+					      dq->queue_id, 0);
       }
   }
 
diff --git a/src/vnet/devices/virtio/virtio.c b/src/vnet/devices/virtio/virtio.c
index cfeb302..2648f29 100644
--- a/src/vnet/devices/virtio/virtio.c
+++ b/src/vnet/devices/virtio/virtio.c
@@ -277,6 +277,7 @@
 	    vlib_cli_output (vm, "  host-ns \"%s\"", vif->net_ns);
 	  vlib_cli_output (vm, "  fd %d", vif->fd);
 	  vlib_cli_output (vm, "  tap-fd %d", vif->tap_fd);
+	  vlib_cli_output (vm, "  gso-enabled %d", vif->gso_enabled);
 	}
       vlib_cli_output (vm, "  Mac Address: %U", format_ethernet_address,
 		       vif->mac_addr);
diff --git a/src/vnet/devices/virtio/virtio.h b/src/vnet/devices/virtio/virtio.h
index af61ca5..f728196 100644
--- a/src/vnet/devices/virtio/virtio.h
+++ b/src/vnet/devices/virtio/virtio.h
@@ -173,6 +173,7 @@
   u8 host_ip4_prefix_len;
   ip6_address_t host_ip6_addr;
   u8 host_ip6_prefix_len;
+  int gso_enabled;
   int ifindex;
 } virtio_if_t;