ip: enhance tunnel vtep4_check with AVX512 vectorization

This patch aims to improve decap performance by reducing expensive
hash_get calls as much as possible, using AVX512 on Xeon. It applies
to the vxlan, vxlan_gpe, geneve and gtpu tunnel types.

In the existing code, if the vtep4 of the current packet matches the
last cached vtep4_key_t, the expensive hash computation is avoided
and the check returns early.
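
For reference, the scalar check is roughly the following (a sketch of
the vtep.h helper; names such as vlib_buffer_get_ip4_fib_index follow
current VPP sources and may differ slightly):

  static_always_inline u8
  vtep4_check (vtep_table_t * t, vlib_buffer_t * b0, ip4_header_t * ip40,
               vtep4_key_t * last_k4)
  {
    vtep4_key_t k4;
    k4.addr.as_u32 = ip40->dst_address.as_u32;
    k4.fib_index = vlib_buffer_get_ip4_fib_index (b0);
    if (PREDICT_TRUE (k4.as_u64 == last_k4->as_u64))
      return 1;		/* same VTEP as last packet, skip hash_get */
    if (PREDICT_FALSE (!hash_get (t->vtep4, k4.as_u64)))
      return 0;		/* no local VTEP */
    last_k4->as_u64 = k4.as_u64;
    return 1;
  }

With many concurrent flows, however, this single-entry cache rarely
hits and nearly every packet pays for a hash_get.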

This patch greatly improves the multi-flow tunnel decap case by
leveraging a 512-bit vector register on Xeon that accommodates 8
vtep4_keys. It increases the probability of skipping the hash
computation: the lookup is avoided whenever the hash key of the
current packet hits any one of the 8 entries in the 512-bit cache.

The oldest element in vtep4_cache_t is updated in round-robin order.
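
A simplified sketch of the vectorized path follows; the vtep4_cache_t
layout and helper names are illustrative, and raw AVX512 intrinsics
are shown where the vppinfra wrapper name is not guaranteed:

  typedef struct
  {
    vtep4_key_t vtep4_cache[8];	/* 8 x 64-bit keys = one 512-bit register */
    u8 idx;			/* round-robin replacement index */
  } vtep4_cache_t;

  static_always_inline u8
  vtep4_check_vector (vtep_table_t * t, vlib_buffer_t * b0,
                      ip4_header_t * ip40, vtep4_key_t * last_k4,
                      vtep4_cache_t * vtep4_u512)
  {
    vtep4_key_t k4;
    k4.addr.as_u32 = ip40->dst_address.as_u32;
    k4.fib_index = vlib_buffer_get_ip4_fib_index (b0);

    /* scalar fast path: same key as the previous packet */
    if (PREDICT_TRUE (k4.as_u64 == last_k4->as_u64))
      return 1;

    /* compare the key against all 8 cached keys in one AVX512 compare */
    __m512i cache = _mm512_loadu_si512 (vtep4_u512->vtep4_cache);
    __mmask8 m = _mm512_cmpeq_epi64_mask (cache,
                                          _mm512_set1_epi64 (k4.as_u64));
    if (PREDICT_TRUE (m != 0))
      return 1;			/* cache hit, hash_get avoided */

    /* slow path: full hash lookup */
    if (PREDICT_FALSE (!hash_get (t->vtep4, k4.as_u64)))
      return 0;			/* no local VTEP */

    /* cache the key, evicting the oldest entry round-robin */
    vtep4_u512->vtep4_cache[vtep4_u512->idx].as_u64 = k4.as_u64;
    vtep4_u512->idx = (vtep4_u512->idx + 1) & 0x7;
    last_k4->as_u64 = k4.as_u64;
    return 1;
  }

Eight 64-bit keys fill one ZMM register exactly, so the membership
test costs a single masked compare no matter which of the 8 recent
VTEPs a packet belongs to.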

The patch also switches to vlib_get_buffers to translate all buffer
indices to buffer pointers in one pass.

Type: improvement

Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Signed-off-by: Junfeng Wang <drenfong.wang@intel.com>
Change-Id: I313103202bd76f2dd638cd942554721b37ddad60
diff --git a/src/vnet/geneve/decap.c b/src/vnet/geneve/decap.c
index 10a17ce..b570e35 100644
--- a/src/vnet/geneve/decap.c
+++ b/src/vnet/geneve/decap.c
@@ -869,11 +869,18 @@
 				   matching a local VTEP address */
   vtep6_key_t last_vtep6;	/* last IPv6 address / fib index
 				   matching a local VTEP address */
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+  vtep4_cache_t vtep4_u512;
+  clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
 
+  vlib_get_buffers (vm, from, bufs, n_left_from);
+
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
 
@@ -900,16 +907,11 @@
 
 	  /* Prefetch next iteration. */
 	  {
-	    vlib_buffer_t *p2, *p3;
+	    vlib_prefetch_buffer_header (b[2], LOAD);
+	    vlib_prefetch_buffer_header (b[3], LOAD);
 
-	    p2 = vlib_get_buffer (vm, from[2]);
-	    p3 = vlib_get_buffer (vm, from[3]);
-
-	    vlib_prefetch_buffer_header (p2, LOAD);
-	    vlib_prefetch_buffer_header (p3, LOAD);
-
-	    CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-	    CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
 	  }
 
 	  bi0 = to_next[0] = from[0];
@@ -919,8 +921,9 @@
 	  to_next += 2;
 	  n_left_to_next -= 2;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-	  b1 = vlib_get_buffer (vm, bi1);
+	  b0 = b[0];
+	  b1 = b[1];
+	  b += 2;
 	  if (is_ip4)
 	    {
 	      ip40 = vlib_buffer_get_current (b0);
@@ -964,7 +967,12 @@
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit0;	/* no local VTEP for GENEVE packet */
 	    }
 	  else
@@ -1042,7 +1050,12 @@
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b1, ip41, &last_vtep4))
+#endif
 		goto exit1;	/* no local VTEP for GENEVE packet */
 	    }
 	  else
@@ -1126,7 +1139,8 @@
 	  to_next += 1;
 	  n_left_to_next -= 1;
 
-	  b0 = vlib_get_buffer (vm, bi0);
+	  b0 = b[0];
+	  b++;
 	  if (is_ip4)
 	    ip40 = vlib_buffer_get_current (b0);
 	  else
@@ -1156,7 +1170,12 @@
 	  /* Validate DIP against VTEPs */
 	  if (is_ip4)
 	    {
+#ifdef CLIB_HAVE_VEC512
+	      if (!vtep4_check_vector
+		  (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
 	      if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
 		goto exit;	/* no local VTEP for GENEVE packet */
 	    }
 	  else