misc: add vnet/pipeline.h example to the sample plugin. We should probably suggest that folks use the pipeline.h coding model more often. It's really easy, and these days the performance results are similar to quad-single loop coding. Type: refactor Change-Id: Ie2caa087972737e6d9c31c4ac79355f3d8ced282 Signed-off-by: Dave Barach <dave@barachs.net>

commit: d56550c2b669558aa38d93f44a9a3b31e0b9370f [log] [tgz]
author: Dave Barach <dbarach@cisco.com> Fri Jul 26 11:58:16 2019 -0400
committer: Florin Coras <florin.coras@gmail.com> Mon Jul 29 22:35:19 2019 +0000
tree: fc5c06b5b2586dbda4086bbec425d01e29984f04
parent: 2b5fed8696ce2a9b67e63cf5b5dbf49505172c9a [diff]
diff --git a/src/examples/sample-plugin/sample/node.c b/src/examples/sample-plugin/sample/node.c
index 1f0a2e9..2c71b48 100644
--- a/src/examples/sample-plugin/sample/node.c
+++ b/src/examples/sample-plugin/sample/node.c

@@ -76,8 +76,8 @@
  *
  * Node costs 30 clocks/pkt at a vector size of 51
  */
-#define VERSION_1 1
 
+#define VERSION_1 1
 #ifdef VERSION_1
 #define foreach_mac_address_offset              \
 _(0)                                            \
@@ -599,6 +599,94 @@
 }
 #endif
 
+/*
+ * This version computes all of the buffer pointers in
+ * one motion, uses a fully pipelined loop model, and
+ * traces the entire frame in one motion.
+ *
+ * It's performance-competitive with other coding paradigms,
+ * and it's the simplest way to write performant vpp code
+ */
+
+
+#ifdef VERSION_4
+
+#define u8x16_shuffle __builtin_shuffle /* local alias for the compiler's vector shuffle builtin */
+
+static u8x16 swapmac = /* shuffle mask: output bytes 0-5 come from input bytes 6-11 and vice versa; bytes 12-15 stay put */
+  { 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 12, 13, 14, 15 };
+
+/* Last pipeline stage: swap the macs, aim the buffer at its rx interface */
+static inline u32
+last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
+{
+  /* View the first 16 bytes at the buffer's current position as one vector */
+  u8x16 *hdr = (u8x16 *) vlib_buffer_get_current (b);
+
+  /* A single shuffle exchanges bytes 0-5 with bytes 6-11 in place */
+  hdr[0] = u8x16_shuffle (hdr[0], swapmac);
+  vnet_buffer (b)->sw_if_index[VLIB_TX] =
+    vnet_buffer (b)->sw_if_index[VLIB_RX];
+  return 0;			/* set next-index[] to 0 for this buffer */
+}
+
+/*
+ * stage1 is deliberately empty: it and stage2 are nil stages, added only to
+ * increase the prefetch stride. The optimal stride may differ per platform.
+ */
+static inline void
+stage1 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
+{
+}
+
+static inline void
+stage2 (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b)
+{				/* empty on purpose: second nil stage, does no work */
+}
+
+#define NSTAGES 4		/* stage0, stage1, stage2, last_stage */
+#define STAGE_INLINE inline __attribute__((__always_inline__))
+/* stage0 is not written in this file: the name is mapped onto generic_stage0 (presumably supplied by pipeline.h - confirm) */
+#define stage0 generic_stage0
+/* NOTE(review): the macros above look like inputs consumed by pipeline.h, so keep them ahead of the include - confirm */
+#include <vnet/pipeline.h>
+
+VLIB_NODE_FN (sample_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
+			    vlib_frame_t * frame)
+{
+  /* Run the frame through the pipeline, count it, then trace if enabled */
+  dispatch_pipeline (vm, node, frame);
+  vlib_node_increment_counter (vm, sample_node.index,
+			       SAMPLE_ERROR_SWAPPED, frame->n_vectors);
+  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+    {
+      /* Turn the frame's buffer indices into buffer pointers for tracing */
+      vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+      u32 i, *from = vlib_frame_vector_args (frame);
+
+      vlib_get_buffers (vm, from, bufs, frame->n_vectors);
+      for (i = 0; i < frame->n_vectors; i++, b++)
+	{
+	  ethernet_header_t *en;
+	  sample_trace_t *t;
+
+	  /* Tracing stops at the first buffer without the traced flag */
+	  if (!(b[0]->flags & VLIB_BUFFER_IS_TRACED))
+	    break;
+	  t = vlib_add_trace (vm, node, b[0], sizeof (*t));
+	  t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
+	  t->next_index = SAMPLE_NEXT_INTERFACE_OUTPUT;
+	  en = vlib_buffer_get_current (b[0]);
+	  clib_memcpy_fast (t->new_src_mac, en->src_address,
+			    sizeof (t->new_src_mac));
+	  clib_memcpy_fast (t->new_dst_mac, en->dst_address,
+			    sizeof (t->new_dst_mac));
+	}
+    }
+  return frame->n_vectors;
+}
+#endif
+
 /* *INDENT-OFF* */
 VLIB_REGISTER_NODE (sample_node) =
 {
commit	d56550c2b669558aa38d93f44a9a3b31e0b9370f	[log] [tgz]
author	Dave Barach <dbarach@cisco.com>	Fri Jul 26 11:58:16 2019 -0400
committer	Florin Coras <florin.coras@gmail.com>	Mon Jul 29 22:35:19 2019 +0000
tree	fc5c06b5b2586dbda4086bbec425d01e29984f04
parent	2b5fed8696ce2a9b67e63cf5b5dbf49505172c9a [diff]