| /* |
| * Copyright (c) 2015 Cisco and/or its affiliates. |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| #include <vlib/vlib.h> |
| #include <vnet/vnet.h> |
| #include <vnet/pg/pg.h> |
| #include <vnet/ethernet/ethernet.h> |
| #include <vppinfra/error.h> |
| #include <sample/sample.h> |
| |
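/*
 * Per-packet trace record: filled in by the node function below and
 * printed by format_sample_trace when the packet trace is displayed.
 */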
| typedef struct |
| { |
| u32 next_index; |
| u32 sw_if_index; |
| u8 new_src_mac[6]; |
| u8 new_dst_mac[6]; |
| } sample_trace_t; |
| |
| |
| /* packet trace format function */ |
| static u8 * |
| format_sample_trace (u8 * s, va_list * args) |
| { |
| CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); |
| CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); |
| sample_trace_t *t = va_arg (*args, sample_trace_t *); |
| |
| s = format (s, "SAMPLE: sw_if_index %d, next index %d\n", |
| t->sw_if_index, t->next_index); |
| s = format (s, " new src %U -> new dst %U", |
| format_mac_address, t->new_src_mac, |
| format_mac_address, t->new_dst_mac); |
| |
| return s; |
| } |
| |
| extern vlib_node_registration_t sample_node; |
| |
| #define foreach_sample_error \ |
| _(SWAPPED, "Mac swap packets processed") |
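
/*
 * The "_" list above is expanded twice below: once to generate the
 * SAMPLE_ERROR_SWAPPED enum value and once to generate the matching
 * counter-name string, keeping the two tables in sync.
 */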
| |
| typedef enum |
| { |
| #define _(sym,str) SAMPLE_ERROR_##sym, |
| foreach_sample_error |
| #undef _ |
| SAMPLE_N_ERROR, |
| } sample_error_t; |
| |
| static char *sample_error_strings[] = { |
| #define _(sym,string) string, |
| foreach_sample_error |
| #undef _ |
| }; |
| |
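/*
 * Next-node dispositions; these enum values index the .next_nodes
 * block in the node registration at the bottom of this file.
 */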
| typedef enum |
| { |
| SAMPLE_NEXT_INTERFACE_OUTPUT, |
| SAMPLE_N_NEXT, |
| } sample_next_t; |
| |
| /* |
| * Simple dual/single loop version, default version which will compile |
| * everywhere. |
| * |
| * Node costs 30 clocks/pkt at a vector size of 51 |
| */ |
| |
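/*
 * Exactly one VERSION_n block below should be enabled at a time; each
 * block supplies its own VLIB_NODE_FN (sample_node) implementation.
 */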
| #define VERSION_1 1 |
| #ifdef VERSION_1 |
| #define foreach_mac_address_offset \ |
| _(0) \ |
| _(1) \ |
| _(2) \ |
| _(3) \ |
| _(4) \ |
| _(5) |
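
/*
 * Used below to unroll byte-wise MAC address copies: for example,
 * "#define _(a) tmp0[a] = en0->src_address[a];" followed by
 * foreach_mac_address_offset expands to six copies, one per octet.
 */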
| |
| VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, |
| vlib_frame_t * frame) |
| { |
| u32 n_left_from, *from, *to_next; |
| sample_next_t next_index; |
| u32 pkts_swapped = 0; |
| |
| from = vlib_frame_vector_args (frame); |
| n_left_from = frame->n_vectors; |
| next_index = node->cached_next_index; |
| |
| while (n_left_from > 0) |
| { |
| u32 n_left_to_next; |
| |
| vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); |
| |
| while (n_left_from >= 4 && n_left_to_next >= 2) |
| { |
| u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| u32 sw_if_index0, sw_if_index1; |
| u8 tmp0[6], tmp1[6]; |
| ethernet_header_t *en0, *en1; |
| u32 bi0, bi1; |
| vlib_buffer_t *b0, *b1; |
| |
| /* Prefetch next iteration. */ |
| { |
| vlib_buffer_t *p2, *p3; |
| |
| p2 = vlib_get_buffer (vm, from[2]); |
| p3 = vlib_get_buffer (vm, from[3]); |
| |
| vlib_prefetch_buffer_header (p2, LOAD); |
| vlib_prefetch_buffer_header (p3, LOAD); |
| |
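	    /*
	     * Prefetch packet data with a STORE hint: the first cache
	     * line is rewritten when the MAC addresses are swapped below.
	     */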
| CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); |
| CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); |
| } |
| |
| /* speculatively enqueue b0 and b1 to the current next frame */ |
| to_next[0] = bi0 = from[0]; |
| to_next[1] = bi1 = from[1]; |
| from += 2; |
| to_next += 2; |
| n_left_from -= 2; |
| n_left_to_next -= 2; |
| |
| b0 = vlib_get_buffer (vm, bi0); |
| b1 = vlib_get_buffer (vm, bi1); |
| |
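	  /* Direct from the driver: current_data should be 0, i.e. &b->data[0] */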
| ASSERT (b0->current_data == 0); |
| ASSERT (b1->current_data == 0); |
| |
| en0 = vlib_buffer_get_current (b0); |
| en1 = vlib_buffer_get_current (b1); |
| |
| /* This is not the fastest way to swap src + dst mac addresses */ |
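	  /* (VERSION_2 below does the same swap with a single 16-byte shuffle.) */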
| #define _(a) tmp0[a] = en0->src_address[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| #define _(a) en0->src_address[a] = en0->dst_address[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| #define _(a) en0->dst_address[a] = tmp0[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| |
| #define _(a) tmp1[a] = en1->src_address[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| #define _(a) en1->src_address[a] = en1->dst_address[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| #define _(a) en1->dst_address[a] = tmp1[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| |
| sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; |
| sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX]; |
| |
| /* Send pkt back out the RX interface */ |
| vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; |
| vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1; |
| |
| pkts_swapped += 2; |
| |
| if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) |
| { |
| if (b0->flags & VLIB_BUFFER_IS_TRACED) |
| { |
| sample_trace_t *t = |
| vlib_add_trace (vm, node, b0, sizeof (*t)); |
| t->sw_if_index = sw_if_index0; |
| t->next_index = next0; |
| clib_memcpy_fast (t->new_src_mac, en0->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en0->dst_address, |
| sizeof (t->new_dst_mac)); |
| |
| } |
| if (b1->flags & VLIB_BUFFER_IS_TRACED) |
| { |
| sample_trace_t *t = |
| vlib_add_trace (vm, node, b1, sizeof (*t)); |
| t->sw_if_index = sw_if_index1; |
| t->next_index = next1; |
| clib_memcpy_fast (t->new_src_mac, en1->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en1->dst_address, |
| sizeof (t->new_dst_mac)); |
| } |
| } |
| |
| /* verify speculative enqueues, maybe switch current next frame */ |
| vlib_validate_buffer_enqueue_x2 (vm, node, next_index, |
| to_next, n_left_to_next, |
| bi0, bi1, next0, next1); |
| } |
| |
| while (n_left_from > 0 && n_left_to_next > 0) |
| { |
| u32 bi0; |
| vlib_buffer_t *b0; |
| u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| u32 sw_if_index0; |
| u8 tmp0[6]; |
| ethernet_header_t *en0; |
| |
| /* speculatively enqueue b0 to the current next frame */ |
| bi0 = from[0]; |
| to_next[0] = bi0; |
| from += 1; |
| to_next += 1; |
| n_left_from -= 1; |
| n_left_to_next -= 1; |
| |
| b0 = vlib_get_buffer (vm, bi0); |
| /* |
| * Direct from the driver, we should be at offset 0 |
| * aka at &b0->data[0] |
| */ |
| ASSERT (b0->current_data == 0); |
| |
| en0 = vlib_buffer_get_current (b0); |
| |
| /* This is not the fastest way to swap src + dst mac addresses */ |
| #define _(a) tmp0[a] = en0->src_address[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| #define _(a) en0->src_address[a] = en0->dst_address[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| #define _(a) en0->dst_address[a] = tmp0[a]; |
| foreach_mac_address_offset; |
| #undef _ |
| |
| sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; |
| |
| /* Send pkt back out the RX interface */ |
| vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; |
| |
| if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) |
| && (b0->flags & VLIB_BUFFER_IS_TRACED))) |
| { |
| sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); |
| t->sw_if_index = sw_if_index0; |
| t->next_index = next0; |
| clib_memcpy_fast (t->new_src_mac, en0->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en0->dst_address, |
| sizeof (t->new_dst_mac)); |
| } |
| |
| pkts_swapped += 1; |
| |
| /* verify speculative enqueue, maybe switch current next frame */ |
| vlib_validate_buffer_enqueue_x1 (vm, node, next_index, |
| to_next, n_left_to_next, |
| bi0, next0); |
| } |
| |
| vlib_put_next_frame (vm, node, next_index, n_left_to_next); |
| } |
| |
| vlib_node_increment_counter (vm, sample_node.index, |
| SAMPLE_ERROR_SWAPPED, pkts_swapped); |
| return frame->n_vectors; |
| } |
| #endif |
| |
| /* |
| * This version swaps mac addresses using an MMX vector shuffle |
| * Node costs about 17 clocks/pkt at a vector size of 26 |
| */ |
| #ifdef VERSION_2 |
| VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, |
| vlib_frame_t * frame) |
| { |
| u32 n_left_from, *from, *to_next; |
| sample_next_t next_index; |
| u32 pkts_swapped = 0; |
| /* Vector shuffle mask to swap src, dst */ |
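  /*
   * Bytes 0-5 of an ethernet header are the destination MAC and bytes
   * 6-11 the source MAC; the mask exchanges the two 6-byte fields and
   * leaves bytes 12-15 (ethertype plus two payload bytes) in place.
   */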
| u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; |
| |
| from = vlib_frame_vector_args (frame); |
| n_left_from = frame->n_vectors; |
| next_index = node->cached_next_index; |
| |
| while (n_left_from > 0) |
| { |
| u32 n_left_to_next; |
| |
| vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); |
| while (n_left_from >= 4 && n_left_to_next >= 2) |
| { |
| u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| u32 next1 = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| u32 sw_if_index0, sw_if_index1; |
| u8x16 src_dst0, src_dst1; |
| ethernet_header_t *en0, *en1; |
| u32 bi0, bi1; |
| vlib_buffer_t *b0, *b1; |
| |
| /* Prefetch next iteration. */ |
| { |
| vlib_buffer_t *p2, *p3; |
| |
| p2 = vlib_get_buffer (vm, from[2]); |
| p3 = vlib_get_buffer (vm, from[3]); |
| |
| vlib_prefetch_buffer_header (p2, LOAD); |
| vlib_prefetch_buffer_header (p3, LOAD); |
| |
| CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); |
| CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); |
| } |
| |
| /* speculatively enqueue b0 and b1 to the current next frame */ |
| to_next[0] = bi0 = from[0]; |
| to_next[1] = bi1 = from[1]; |
| from += 2; |
| to_next += 2; |
| n_left_from -= 2; |
| n_left_to_next -= 2; |
| |
| b0 = vlib_get_buffer (vm, bi0); |
| b1 = vlib_get_buffer (vm, bi1); |
| |
| ASSERT (b0->current_data == 0); |
| ASSERT (b1->current_data == 0); |
| |
| en0 = vlib_buffer_get_current (b0); |
| en1 = vlib_buffer_get_current (b1); |
| |
| src_dst0 = ((u8x16 *) en0)[0]; |
| src_dst1 = ((u8x16 *) en1)[0]; |
| src_dst0 = u8x16_shuffle (src_dst0, swapmac); |
| src_dst1 = u8x16_shuffle (src_dst1, swapmac); |
| ((u8x16 *) en0)[0] = src_dst0; |
| ((u8x16 *) en1)[0] = src_dst1; |
| |
| sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; |
| sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX]; |
| |
| /* Send pkt back out the RX interface */ |
| vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; |
| vnet_buffer (b1)->sw_if_index[VLIB_TX] = sw_if_index1; |
| |
| pkts_swapped += 2; |
| |
| if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) |
| { |
| if (b0->flags & VLIB_BUFFER_IS_TRACED) |
| { |
| sample_trace_t *t = |
| vlib_add_trace (vm, node, b0, sizeof (*t)); |
| t->sw_if_index = sw_if_index0; |
| t->next_index = next0; |
| clib_memcpy_fast (t->new_src_mac, en0->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en0->dst_address, |
| sizeof (t->new_dst_mac)); |
| |
| } |
| if (b1->flags & VLIB_BUFFER_IS_TRACED) |
| { |
| sample_trace_t *t = |
| vlib_add_trace (vm, node, b1, sizeof (*t)); |
| t->sw_if_index = sw_if_index1; |
| t->next_index = next1; |
| clib_memcpy_fast (t->new_src_mac, en1->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en1->dst_address, |
| sizeof (t->new_dst_mac)); |
| } |
| } |
| |
| /* verify speculative enqueues, maybe switch current next frame */ |
| vlib_validate_buffer_enqueue_x2 (vm, node, next_index, |
| to_next, n_left_to_next, |
| bi0, bi1, next0, next1); |
| } |
| |
| while (n_left_from > 0 && n_left_to_next > 0) |
| { |
| u32 bi0; |
| vlib_buffer_t *b0; |
| u32 next0 = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| u32 sw_if_index0; |
| u8x16 src_dst0; |
| ethernet_header_t *en0; |
| |
| /* speculatively enqueue b0 to the current next frame */ |
| bi0 = from[0]; |
| to_next[0] = bi0; |
| from += 1; |
| to_next += 1; |
| n_left_from -= 1; |
| n_left_to_next -= 1; |
| |
| b0 = vlib_get_buffer (vm, bi0); |
| /* |
| * Direct from the driver, we should be at offset 0 |
| * aka at &b0->data[0] |
| */ |
| ASSERT (b0->current_data == 0); |
| |
| en0 = vlib_buffer_get_current (b0); |
| src_dst0 = ((u8x16 *) en0)[0]; |
| src_dst0 = u8x16_shuffle (src_dst0, swapmac); |
| ((u8x16 *) en0)[0] = src_dst0; |
| |
| sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; |
| |
| /* Send pkt back out the RX interface */ |
| vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; |
| |
| if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) |
| && (b0->flags & VLIB_BUFFER_IS_TRACED))) |
| { |
| sample_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); |
| t->sw_if_index = sw_if_index0; |
| t->next_index = next0; |
| clib_memcpy_fast (t->new_src_mac, en0->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en0->dst_address, |
| sizeof (t->new_dst_mac)); |
| } |
| |
| pkts_swapped += 1; |
| |
| /* verify speculative enqueue, maybe switch current next frame */ |
| vlib_validate_buffer_enqueue_x1 (vm, node, next_index, |
| to_next, n_left_to_next, |
| bi0, next0); |
| } |
| |
| vlib_put_next_frame (vm, node, next_index, n_left_to_next); |
| } |
| |
| vlib_node_increment_counter (vm, sample_node.index, |
| SAMPLE_ERROR_SWAPPED, pkts_swapped); |
| return frame->n_vectors; |
| } |
| #endif |
| |
| |
| /* |
| * This version computes all of the buffer pointers in |
| * one motion, uses a quad/single loop model, and |
| * traces the entire frame in one motion. |
| * |
| * Node costs about 16 clocks/pkt at a vector size of 26 |
| * |
| * Some compilation drama with u8x16_shuffle, so turned off by |
| * default. |
| */ |
| |
| #ifdef VERSION_3 |
| |
| #define u8x16_shuffle __builtin_shuffle |
| /* This would normally be a stack local, but since it's a constant... */ |
| static const u16 nexts[VLIB_FRAME_SIZE] = { 0 }; |
| |
| VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, |
| vlib_frame_t * frame) |
| { |
| u32 n_left_from, *from; |
| u32 pkts_swapped = 0; |
| /* Vector shuffle mask to swap src, dst */ |
| u8x16 swapmac = { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; |
| vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; |
| /* See comment below about sending all pkts to the same place... */ |
| u16 *next __attribute__ ((unused)); |
| |
| from = vlib_frame_vector_args (frame); |
| n_left_from = frame->n_vectors; |
| |
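  /* Translate all of the frame's buffer indices to buffer pointers at once */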
| vlib_get_buffers (vm, from, bufs, n_left_from); |
| b = bufs; |
| // next = nexts; |
| |
| /* |
| * We send all pkts to SAMPLE_NEXT_INTERFACE_OUTPUT, aka |
| * graph arc 0. So the usual setting of next[0...3] is commented |
| * out below |
| */ |
| |
| while (n_left_from >= 4) |
| { |
| u8x16 src_dst0, src_dst1, src_dst2, src_dst3; |
| /* Prefetch next iteration. */ |
| if (PREDICT_TRUE (n_left_from >= 8)) |
| { |
| vlib_prefetch_buffer_header (b[4], STORE); |
| vlib_prefetch_buffer_header (b[5], STORE); |
| vlib_prefetch_buffer_header (b[6], STORE); |
| vlib_prefetch_buffer_header (b[7], STORE); |
| CLIB_PREFETCH (&b[4]->data, CLIB_CACHE_LINE_BYTES, STORE); |
| CLIB_PREFETCH (&b[5]->data, CLIB_CACHE_LINE_BYTES, STORE); |
| CLIB_PREFETCH (&b[6]->data, CLIB_CACHE_LINE_BYTES, STORE); |
| CLIB_PREFETCH (&b[7]->data, CLIB_CACHE_LINE_BYTES, STORE); |
| } |
| |
| src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0]; |
| src_dst1 = ((u8x16 *) vlib_buffer_get_current (b[1]))[0]; |
| src_dst2 = ((u8x16 *) vlib_buffer_get_current (b[2]))[0]; |
| src_dst3 = ((u8x16 *) vlib_buffer_get_current (b[3]))[0]; |
| |
| src_dst0 = u8x16_shuffle (src_dst0, swapmac); |
| src_dst1 = u8x16_shuffle (src_dst1, swapmac); |
| src_dst2 = u8x16_shuffle (src_dst2, swapmac); |
| src_dst3 = u8x16_shuffle (src_dst3, swapmac); |
| |
| ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0; |
| ((u8x16 *) vlib_buffer_get_current (b[1]))[0] = src_dst1; |
| ((u8x16 *) vlib_buffer_get_current (b[2]))[0] = src_dst2; |
| ((u8x16 *) vlib_buffer_get_current (b[3]))[0] = src_dst3; |
| |
| vnet_buffer (b[0])->sw_if_index[VLIB_TX] = |
| vnet_buffer (b[0])->sw_if_index[VLIB_RX]; |
| vnet_buffer (b[1])->sw_if_index[VLIB_TX] = |
| vnet_buffer (b[1])->sw_if_index[VLIB_RX]; |
| vnet_buffer (b[2])->sw_if_index[VLIB_TX] = |
| vnet_buffer (b[2])->sw_if_index[VLIB_RX]; |
| vnet_buffer (b[3])->sw_if_index[VLIB_TX] = |
| vnet_buffer (b[3])->sw_if_index[VLIB_RX]; |
| |
| // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| // next[1] = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| // next[2] = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| // next[3] = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| |
| b += 4; |
| // next += 4; |
| n_left_from -= 4; |
| pkts_swapped += 4; |
| } |
| |
| while (n_left_from > 0) |
| { |
| u8x16 src_dst0; |
| src_dst0 = ((u8x16 *) vlib_buffer_get_current (b[0]))[0]; |
| src_dst0 = u8x16_shuffle (src_dst0, swapmac); |
| ((u8x16 *) vlib_buffer_get_current (b[0]))[0] = src_dst0; |
| vnet_buffer (b[0])->sw_if_index[VLIB_TX] = |
| vnet_buffer (b[0])->sw_if_index[VLIB_RX]; |
| // next[0] = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| |
| b += 1; |
| // next += 1; |
| n_left_from -= 1; |
| pkts_swapped += 1; |
| |
| } |
| vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts, |
| frame->n_vectors); |
| |
| vlib_node_increment_counter (vm, sample_node.index, |
| SAMPLE_ERROR_SWAPPED, pkts_swapped); |
| |
| if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) |
| { |
| int i; |
| b = bufs; |
| |
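      /*
       * Traced buffers sit at the front of the frame, so stop at the
       * first buffer that is not being traced.
       */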
| for (i = 0; i < frame->n_vectors; i++) |
| { |
| if (b[0]->flags & VLIB_BUFFER_IS_TRACED) |
| { |
| ethernet_header_t *en; |
| sample_trace_t *t = |
| vlib_add_trace (vm, node, b[0], sizeof (*t)); |
| t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX]; |
| t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| en = vlib_buffer_get_current (b[0]); |
| clib_memcpy_fast (t->new_src_mac, en->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en->dst_address, |
| sizeof (t->new_dst_mac)); |
| b++; |
| } |
| else |
| break; |
| } |
| } |
| return frame->n_vectors; |
| } |
| #endif |
| |
| /* |
| * This version computes all of the buffer pointers in |
| * one motion, uses a fully pipelined loop model, and |
| * traces the entire frame in one motion. |
| * |
| * It's performance-competative with other coding paradigms, |
| * and it's the simplest way to write performant vpp code |
| */ |
| |
| |
| #ifdef VERSION_4 |
| |
| #define u8x16_shuffle __builtin_shuffle |
| |
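/* Shuffle mask that exchanges the 6-byte dst and src MAC fields (see VERSION_2) */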
| static u8x16 swapmac = |
| { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 }; |
| |
| /* Final stage in the pipeline, do the mac swap */ |
| static inline u32 |
| last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b) |
| { |
| u8x16 src_dst0; |
| src_dst0 = ((u8x16 *) vlib_buffer_get_current (b))[0]; |
| src_dst0 = u8x16_shuffle (src_dst0, swapmac); |
| ((u8x16 *) vlib_buffer_get_current (b))[0] = src_dst0; |
| vnet_buffer (b)->sw_if_index[VLIB_TX] = |
| vnet_buffer (b)->sw_if_index[VLIB_RX]; |
| /* set next-index[] to 0 for this buffer */ |
| return 0; |
| } |
| |
| /* |
| * Add a couple of nil stages to increase the prefetch stride. |
| * For any specific platform, the optimal prefetch stride may differ. |
| */ |
| static inline void |
| stage1 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b) |
| { |
| } |
| |
| static inline void |
| stage2 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b) |
| { |
| } |
| |
| #define NSTAGES 4 |
| #define STAGE_INLINE inline __attribute__((__always_inline__)) |
| |
| #define stage0 generic_stage0 |
| |
| #include <vnet/pipeline.h> |
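
/*
 * <vnet/pipeline.h> builds dispatch_pipeline (), used below, from the
 * NSTAGES, stage1, stage2 and last_stage definitions above; the nil
 * stages simply lengthen the prefetch pipeline as noted above.
 */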
| |
| VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node, |
| vlib_frame_t * frame) |
| { |
| dispatch_pipeline (vm, node, frame); |
| |
| vlib_node_increment_counter (vm, sample_node.index, |
| SAMPLE_ERROR_SWAPPED, frame->n_vectors); |
| if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) |
| { |
| int i; |
| b = bufs; |
| |
| for (i = 0; i < frame->n_vectors; i++) |
| { |
| if (b[0]->flags & VLIB_BUFFER_IS_TRACED) |
| { |
| ethernet_header_t *en; |
| sample_trace_t *t = |
| vlib_add_trace (vm, node, b[0], sizeof (*t)); |
| t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX]; |
| t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT; |
| en = vlib_buffer_get_current (b[0]); |
| clib_memcpy_fast (t->new_src_mac, en->src_address, |
| sizeof (t->new_src_mac)); |
| clib_memcpy_fast (t->new_dst_mac, en->dst_address, |
| sizeof (t->new_dst_mac)); |
| b++; |
| } |
| else |
| break; |
| } |
| } |
| return frame->n_vectors; |
| } |
| #endif |
| |
| /* *INDENT-OFF* */ |
| VLIB_REGISTER_NODE (sample_node) = |
| { |
| .name = "sample", |
| .vector_size = sizeof (u32), |
| .format_trace = format_sample_trace, |
| .type = VLIB_NODE_TYPE_INTERNAL, |
| |
| .n_errors = ARRAY_LEN(sample_error_strings), |
| .error_strings = sample_error_strings, |
| |
| .n_next_nodes = SAMPLE_N_NEXT, |
| |
| /* edit / add dispositions here */ |
| .next_nodes = { |
| [SAMPLE_NEXT_INTERFACE_OUTPUT] = "interface-output", |
| }, |
| }; |
| /* *INDENT-ON* */ |
| |
| /* |
| * fd.io coding-style-patch-verification: ON |
| * |
| * Local Variables: |
| * eval: (c-set-style "gnu") |
| * End: |
| */ |