vxlan-gpe: improve encap performance

This patch improves performance by prefetching the encap header area
and taking full advantage of the optimized function vlib_get_buffers.
With the patch applied, vxlan_gpe_encap saves 4.1 clocks/pkt,
dropping from 41.7 to 37.6 clocks/pkt on Skylake.

Change-Id: I85d486b21a2524d64f2e246dfb4183539ec2532d
Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
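---

For context, vlib_get_buffers translates the whole vector of buffer
indices into pointers up front, so the dual- and single-packet loops
below index a local pointer array instead of calling vlib_get_buffer
once per packet. A minimal scalar sketch of the semantics (the real
implementation in vlib/buffer_funcs.h is SIMD-optimized;
get_buffers_sketch is a hypothetical name, not VPP code):

    /* Sketch only: conceptual scalar equivalent of vlib_get_buffers. */
    static inline void
    get_buffers_sketch (vlib_main_t * vm, u32 * bi, vlib_buffer_t ** b,
			u32 count)
    {
      u32 i;
      for (i = 0; i < count; i++)
	/* one index->pointer translation per packet, hoisted out of
	   the per-packet processing loops */
	b[i] = vlib_get_buffer (vm, bi[i]);
    }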
diff --git a/src/vnet/vxlan-gpe/encap.c b/src/vnet/vxlan-gpe/encap.c
index 9006b1b..1cca415 100644
--- a/src/vnet/vxlan-gpe/encap.c
+++ b/src/vnet/vxlan-gpe/encap.c
@@ -153,6 +153,7 @@
   u32 pkts_encapsulated = 0;
   u32 thread_index = vm->thread_index;
   u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -160,6 +161,7 @@
   next_index = node->cached_next_index;
   stats_sw_if_index = node->runtime_data[0];
   stats_n_packets = stats_n_bytes = 0;
+  vlib_get_buffers (vm, from, bufs, n_left_from);
 
   while (n_left_from > 0)
     {
@@ -174,23 +176,19 @@
       while (n_left_from >= 4 && n_left_to_next >= 2)
 	{
 	  u32 bi0, bi1;
-	  vlib_buffer_t *b0, *b1;
 	  u32 next0, next1;
 
 	  next0 = next1 = VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP;
 
 	  /* Prefetch next iteration. */
 	  {
-	    vlib_buffer_t *p2, *p3;
+	    vlib_prefetch_buffer_header (b[2], LOAD);
+	    vlib_prefetch_buffer_header (b[3], LOAD);
 
-	    p2 = vlib_get_buffer (vm, from[2]);
-	    p3 = vlib_get_buffer (vm, from[3]);
-
-	    vlib_prefetch_buffer_header (p2, LOAD);
-	    vlib_prefetch_buffer_header (p3, LOAD);
-
-	    CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
-	    CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[2]->data - CLIB_CACHE_LINE_BYTES,
+			   2 * CLIB_CACHE_LINE_BYTES, LOAD);
+	    CLIB_PREFETCH (b[3]->data - CLIB_CACHE_LINE_BYTES,
+			   2 * CLIB_CACHE_LINE_BYTES, LOAD);
 	  }
 
 	  bi0 = from[0];
@@ -202,25 +200,22 @@
 	  n_left_to_next -= 2;
 	  n_left_from -= 2;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-	  b1 = vlib_get_buffer (vm, bi1);
-
 	  /* get the flag "is_ip4" */
-	  if (sw_if_index0 != vnet_buffer (b0)->sw_if_index[VLIB_TX])
+	  if (sw_if_index0 != vnet_buffer (b[0])->sw_if_index[VLIB_TX])
 	    {
-	      sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+	      sw_if_index0 = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
 	      hi0 =
 		vnet_get_sup_hw_interface (vnm,
-					   vnet_buffer (b0)->sw_if_index
+					   vnet_buffer (b[0])->sw_if_index
 					   [VLIB_TX]);
 	      t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
 	      is_ip4_0 = (t0->flags & VXLAN_GPE_TUNNEL_IS_IPV4);
 	    }
 
 	  /* get the flag "is_ip4" */
-	  if (sw_if_index1 != vnet_buffer (b1)->sw_if_index[VLIB_TX])
+	  if (sw_if_index1 != vnet_buffer (b[1])->sw_if_index[VLIB_TX])
 	    {
-	      if (sw_if_index0 == vnet_buffer (b1)->sw_if_index[VLIB_TX])
+	      if (sw_if_index0 == vnet_buffer (b[1])->sw_if_index[VLIB_TX])
 		{
 		  sw_if_index1 = sw_if_index0;
 		  hi1 = hi0;
@@ -229,10 +224,10 @@
 		}
 	      else
 		{
-		  sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_TX];
+		  sw_if_index1 = vnet_buffer (b[1])->sw_if_index[VLIB_TX];
 		  hi1 =
 		    vnet_get_sup_hw_interface (vnm,
-					       vnet_buffer (b1)->sw_if_index
+					       vnet_buffer (b[1])->sw_if_index
 					       [VLIB_TX]);
 		  t1 = pool_elt_at_index (ngm->tunnels, hi1->dev_instance);
 		  is_ip4_1 = (t1->flags & VXLAN_GPE_TUNNEL_IS_IPV4);
@@ -241,24 +236,24 @@
 
 	  if (PREDICT_TRUE (is_ip4_0 == is_ip4_1))
 	    {
-	      vxlan_gpe_encap_two_inline (ngm, b0, b1, t0, t1, &next0, &next1,
-					  is_ip4_0);
+	      vxlan_gpe_encap_two_inline (ngm, b[0], b[1], t0, t1, &next0,
+					  &next1, is_ip4_0);
 	    }
 	  else
 	    {
-	      vxlan_gpe_encap_one_inline (ngm, b0, t0, &next0, is_ip4_0);
-	      vxlan_gpe_encap_one_inline (ngm, b1, t1, &next1, is_ip4_1);
+	      vxlan_gpe_encap_one_inline (ngm, b[0], t0, &next0, is_ip4_0);
+	      vxlan_gpe_encap_one_inline (ngm, b[1], t1, &next1, is_ip4_1);
 	    }
 
 	  /* Reset to look up tunnel partner in the configured FIB */
-	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
-	  vnet_buffer (b1)->sw_if_index[VLIB_TX] = t1->encap_fib_index;
-	  vnet_buffer (b0)->sw_if_index[VLIB_RX] = sw_if_index0;
-	  vnet_buffer (b1)->sw_if_index[VLIB_RX] = sw_if_index1;
+	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+	  vnet_buffer (b[1])->sw_if_index[VLIB_TX] = t1->encap_fib_index;
+	  vnet_buffer (b[0])->sw_if_index[VLIB_RX] = sw_if_index0;
+	  vnet_buffer (b[1])->sw_if_index[VLIB_RX] = sw_if_index1;
 	  pkts_encapsulated += 2;
 
-	  len0 = vlib_buffer_length_in_chain (vm, b0);
-	  len1 = vlib_buffer_length_in_chain (vm, b1);
+	  len0 = vlib_buffer_length_in_chain (vm, b[0]);
+	  len1 = vlib_buffer_length_in_chain (vm, b[1]);
 	  stats_n_packets += 2;
 	  stats_n_bytes += len0 + len1;
 
@@ -297,19 +292,20 @@
 		}
 	    }
 
-	  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+	  if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_IS_TRACED))
 	    {
 	      vxlan_gpe_encap_trace_t *tr =
-		vlib_add_trace (vm, node, b0, sizeof (*tr));
+		vlib_add_trace (vm, node, b[0], sizeof (*tr));
 	      tr->tunnel_index = t0 - ngm->tunnels;
 	    }
 
-	  if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
+	  if (PREDICT_FALSE (b[1]->flags & VLIB_BUFFER_IS_TRACED))
 	    {
-	      vxlan_gpe_encap_trace_t *tr = vlib_add_trace (vm, node, b1,
+	      vxlan_gpe_encap_trace_t *tr = vlib_add_trace (vm, node, b[1],
 							    sizeof (*tr));
 	      tr->tunnel_index = t1 - ngm->tunnels;
 	    }
+	  b += 2;
 
 	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, bi1, next0,
@@ -319,7 +315,6 @@
       while (n_left_from > 0 && n_left_to_next > 0)
 	{
 	  u32 bi0;
-	  vlib_buffer_t *b0;
 	  u32 next0 = VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP;
 
 	  bi0 = from[0];
@@ -329,15 +324,13 @@
 	  n_left_from -= 1;
 	  n_left_to_next -= 1;
 
-	  b0 = vlib_get_buffer (vm, bi0);
-
 	  /* get the flag "is_ip4" */
-	  if (sw_if_index0 != vnet_buffer (b0)->sw_if_index[VLIB_TX])
+	  if (sw_if_index0 != vnet_buffer (b[0])->sw_if_index[VLIB_TX])
 	    {
-	      sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+	      sw_if_index0 = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
 	      hi0 =
 		vnet_get_sup_hw_interface (vnm,
-					   vnet_buffer (b0)->sw_if_index
+					   vnet_buffer (b[0])->sw_if_index
 					   [VLIB_TX]);
 
 	      t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
@@ -345,14 +338,14 @@
 	      is_ip4_0 = (t0->flags & VXLAN_GPE_TUNNEL_IS_IPV4);
 	    }
 
-	  vxlan_gpe_encap_one_inline (ngm, b0, t0, &next0, is_ip4_0);
+	  vxlan_gpe_encap_one_inline (ngm, b[0], t0, &next0, is_ip4_0);
 
 	  /* Reset to look up tunnel partner in the configured FIB */
-	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
-	  vnet_buffer (b0)->sw_if_index[VLIB_RX] = sw_if_index0;
+	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+	  vnet_buffer (b[0])->sw_if_index[VLIB_RX] = sw_if_index0;
 	  pkts_encapsulated++;
 
-	  len0 = vlib_buffer_length_in_chain (vm, b0);
+	  len0 = vlib_buffer_length_in_chain (vm, b[0]);
 	  stats_n_packets += 1;
 	  stats_n_bytes += len0;
 
@@ -375,12 +368,14 @@
 	      stats_n_bytes = len0;
 	      stats_sw_if_index = sw_if_index0;
 	    }
-	  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+	  if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_IS_TRACED))
 	    {
-	      vxlan_gpe_encap_trace_t *tr = vlib_add_trace (vm, node, b0,
+	      vxlan_gpe_encap_trace_t *tr = vlib_add_trace (vm, node, b[0],
 							    sizeof (*tr));
 	      tr->tunnel_index = t0 - ngm->tunnels;
 	    }
+	  b += 1;
+
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, next0);
 	}
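
The prefetch change is the other half of the win: encap writes the new
outer header into the buffer headroom, i.e. into bytes in front of the
current data pointer, so prefetching from b->data alone missed the
cache line that is written first. Starting the prefetch at
b->data - CLIB_CACHE_LINE_BYTES covers that line as well. A hedged
sketch of the access pattern being warmed (encap_prepend_sketch and its
parameters are illustrative, not the actual vxlan_gpe_encap_*_inline
internals):

    /* Sketch: encap rewinds current_data into the headroom before
       writing the outer header, so the first store lands before the
       old b->data -- the cache line the new prefetch covers. */
    static inline void
    encap_prepend_sketch (vlib_buffer_t * b0, u8 * rewrite, u16 len)
    {
      vlib_buffer_advance (b0, -(word) len); /* step back into headroom */
      clib_memcpy (vlib_buffer_get_current (b0), rewrite, len);
    }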