MPLS performance improvements.

 1 - Quad loop lookup and label imposition.
 2 - optimise imposition for the 1 label case
 3 - input gets TTL from header directly (no byte swap)

Change-Id: I59204c9e5d134b0df75d7afa43e360f946d1ffe7
Signed-off-by: Neale Ranns <nranns@cisco.com>
diff --git a/src/vnet/dpo/mpls_label_dpo.c b/src/vnet/dpo/mpls_label_dpo.c
index bbdc966..be9b285 100644
--- a/src/vnet/dpo/mpls_label_dpo.c
+++ b/src/vnet/dpo/mpls_label_dpo.c
@@ -160,6 +160,33 @@
     mpls_unicast_header_t hdr;
 } mpls_label_imposition_trace_t;
 
+always_inline mpls_unicast_header_t *
+mpls_label_paint (vlib_buffer_t * b0,
+                  mpls_label_dpo_t *mld0,
+                  u8 ttl0)
+{
+    mpls_unicast_header_t *hdr0;
+
+    vlib_buffer_advance(b0, -(mld0->mld_n_hdr_bytes));
+
+    hdr0 = vlib_buffer_get_current(b0);
+
+    if (PREDICT_TRUE(1 == mld0->mld_n_labels))
+    {
+        /* optimise for the common case of one label */
+        *hdr0 = mld0->mld_hdr[0];
+    }
+    else
+    {
+        clib_memcpy(hdr0, mld0->mld_hdr, mld0->mld_n_hdr_bytes);
+        hdr0 = hdr0 + (mld0->mld_n_labels - 1);
+    }
+    /* fixup the TTL for the inner most label */
+    ((char*)hdr0)[3] = ttl0;
+
+    return (hdr0);
+}
+
 always_inline uword
 mpls_label_imposition_inline (vlib_main_t * vm,
                               vlib_node_runtime_t * node,
@@ -180,45 +207,59 @@
 
         vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
 
-        while (n_left_from >= 4 && n_left_to_next >= 2)
+        while (n_left_from >= 8 && n_left_to_next >= 4)
         {
-            mpls_unicast_header_t *hdr0, *hdr1;
-            mpls_label_dpo_t *mld0, *mld1;
-            u32 bi0, mldi0, bi1, mldi1;
-            vlib_buffer_t * b0, *b1;
-            u32 next0, next1;
-            u8 ttl0, ttl1;
+            u32 bi0, mldi0, bi1, mldi1, bi2, mldi2, bi3, mldi3;
+            mpls_unicast_header_t *hdr0, *hdr1, *hdr2, *hdr3;
+            mpls_label_dpo_t *mld0, *mld1, *mld2, *mld3;
+            vlib_buffer_t * b0, *b1, * b2, *b3;
+            u32 next0, next1, next2, next3;
+            u8 ttl0, ttl1, ttl2, ttl3;
 
             bi0 = to_next[0] = from[0];
             bi1 = to_next[1] = from[1];
+            bi2 = to_next[2] = from[2];
+            bi3 = to_next[3] = from[3];
 
             /* Prefetch next iteration. */
             {
-                vlib_buffer_t * p2, * p3;
+                vlib_buffer_t * p2, * p3, *p4, *p5;
 
                 p2 = vlib_get_buffer (vm, from[2]);
                 p3 = vlib_get_buffer (vm, from[3]);
+                p4 = vlib_get_buffer (vm, from[4]);
+                p5 = vlib_get_buffer (vm, from[5]);
 
                 vlib_prefetch_buffer_header (p2, STORE);
                 vlib_prefetch_buffer_header (p3, STORE);
+                vlib_prefetch_buffer_header (p4, STORE);
+                vlib_prefetch_buffer_header (p5, STORE);
 
                 CLIB_PREFETCH (p2->data, sizeof (hdr0[0]), STORE);
                 CLIB_PREFETCH (p3->data, sizeof (hdr0[0]), STORE);
+                CLIB_PREFETCH (p4->data, sizeof (hdr0[0]), STORE);
+                CLIB_PREFETCH (p5->data, sizeof (hdr0[0]), STORE);
             }
 
-            from += 2;
-            to_next += 2;
-            n_left_from -= 2;
-            n_left_to_next -= 2;
+            from += 4;
+            to_next += 4;
+            n_left_from -= 4;
+            n_left_to_next -= 4;
 
             b0 = vlib_get_buffer (vm, bi0);
             b1 = vlib_get_buffer (vm, bi1);
+            b2 = vlib_get_buffer (vm, bi2);
+            b3 = vlib_get_buffer (vm, bi3);
 
             /* dst lookup was done by ip4 lookup */
             mldi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
             mldi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX];
+            mldi2 = vnet_buffer(b2)->ip.adj_index[VLIB_TX];
+            mldi3 = vnet_buffer(b3)->ip.adj_index[VLIB_TX];
             mld0 = mpls_label_dpo_get(mldi0);
             mld1 = mpls_label_dpo_get(mldi1);
+            mld2 = mpls_label_dpo_get(mldi2);
+            mld3 = mpls_label_dpo_get(mldi3);
 
             if (payload_is_ip4)
             {
@@ -227,23 +268,37 @@
                  */
                 ip4_header_t * ip0 = vlib_buffer_get_current(b0);
                 ip4_header_t * ip1 = vlib_buffer_get_current(b1);
+                ip4_header_t * ip2 = vlib_buffer_get_current(b2);
+                ip4_header_t * ip3 = vlib_buffer_get_current(b3);
                 u32 checksum0;
                 u32 checksum1;
+                u32 checksum2;
+                u32 checksum3;
 
                 checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
                 checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
+                checksum2 = ip2->checksum + clib_host_to_net_u16 (0x0100);
+                checksum3 = ip3->checksum + clib_host_to_net_u16 (0x0100);
 
                 checksum0 += checksum0 >= 0xffff;
                 checksum1 += checksum1 >= 0xffff;
+                checksum2 += checksum2 >= 0xffff;
+                checksum3 += checksum3 >= 0xffff;
 
                 ip0->checksum = checksum0;
                 ip1->checksum = checksum1;
+                ip2->checksum = checksum2;
+                ip3->checksum = checksum3;
 
                 ip0->ttl -= 1;
                 ip1->ttl -= 1;
+                ip2->ttl -= 1;
+                ip3->ttl -= 1;
 
                 ttl1 = ip1->ttl;
                 ttl0 = ip0->ttl;
+                ttl3 = ip3->ttl;
+                ttl2 = ip2->ttl;
             }
             else if (payload_is_ip6)
             {
@@ -252,13 +307,18 @@
                  */
                 ip6_header_t * ip0 = vlib_buffer_get_current(b0);
                 ip6_header_t * ip1 = vlib_buffer_get_current(b1);
-
+                ip6_header_t * ip2 = vlib_buffer_get_current(b2);
+                ip6_header_t * ip3 = vlib_buffer_get_current(b3);
 
                 ip0->hop_limit -= 1;
                 ip1->hop_limit -= 1;
+                ip2->hop_limit -= 1;
+                ip3->hop_limit -= 1;
 
                 ttl0 = ip0->hop_limit;
                 ttl1 = ip1->hop_limit;
+                ttl2 = ip2->hop_limit;
+                ttl3 = ip3->hop_limit;
             }
             else
             {
@@ -294,30 +354,45 @@
                 {
                     ttl1 = 255;
                 }
+                if (PREDICT_TRUE(vnet_buffer(b2)->mpls.first))
+                {
+                    ASSERT(1 != vnet_buffer (b2)->mpls.ttl);
+
+                    ttl2 = vnet_buffer(b2)->mpls.ttl - 1;
+                }
+                else
+                {
+                    ttl2 = 255;
+                }
+                if (PREDICT_TRUE(vnet_buffer(b3)->mpls.first))
+                {
+                    ASSERT(1 != vnet_buffer (b3)->mpls.ttl);
+                    ttl3 = vnet_buffer(b3)->mpls.ttl - 1;
+                }
+                else
+                {
+                    ttl3 = 255;
+                }
             }
             vnet_buffer(b0)->mpls.first = 0;
             vnet_buffer(b1)->mpls.first = 0;
+            vnet_buffer(b2)->mpls.first = 0;
+            vnet_buffer(b3)->mpls.first = 0;
 
             /* Paint the MPLS header */
-            vlib_buffer_advance(b0, -(mld0->mld_n_hdr_bytes));
-            vlib_buffer_advance(b1, -(mld1->mld_n_hdr_bytes));
-
-            hdr0 = vlib_buffer_get_current(b0);
-            hdr1 = vlib_buffer_get_current(b1);
-
-            clib_memcpy(hdr0, mld0->mld_hdr, mld0->mld_n_hdr_bytes);
-            clib_memcpy(hdr1, mld1->mld_hdr, mld1->mld_n_hdr_bytes);
-
-            /* fixup the TTL for the inner most label */
-            hdr0 = hdr0 + (mld0->mld_n_labels - 1);
-            hdr1 = hdr1 + (mld1->mld_n_labels - 1);
-            ((char*)hdr0)[3] = ttl0;
-            ((char*)hdr1)[3] = ttl1;
+            hdr0 = mpls_label_paint(b0, mld0, ttl0);
+            hdr1 = mpls_label_paint(b1, mld1, ttl1);
+            hdr2 = mpls_label_paint(b2, mld2, ttl2);
+            hdr3 = mpls_label_paint(b3, mld3, ttl3);
 
             next0 = mld0->mld_dpo.dpoi_next_node;
             next1 = mld1->mld_dpo.dpoi_next_node;
+            next2 = mld2->mld_dpo.dpoi_next_node;
+            next3 = mld3->mld_dpo.dpoi_next_node;
             vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mld0->mld_dpo.dpoi_index;
             vnet_buffer(b1)->ip.adj_index[VLIB_TX] = mld1->mld_dpo.dpoi_index;
+            vnet_buffer(b2)->ip.adj_index[VLIB_TX] = mld2->mld_dpo.dpoi_index;
+            vnet_buffer(b3)->ip.adj_index[VLIB_TX] = mld3->mld_dpo.dpoi_index;
 
             if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
             {
@@ -331,10 +406,23 @@
                     vlib_add_trace (vm, node, b1, sizeof (*tr));
                 tr->hdr = *hdr1;
             }
+            if (PREDICT_FALSE(b2->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b2, sizeof (*tr));
+                tr->hdr = *hdr2;
+            }
+            if (PREDICT_FALSE(b3->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b3, sizeof (*tr));
+                tr->hdr = *hdr3;
+            }
 
-            vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next,
+            vlib_validate_buffer_enqueue_x4(vm, node, next_index, to_next,
                                             n_left_to_next,
-                                            bi0, bi1, next0, next1);
+                                            bi0, bi1, bi2, bi3,
+                                            next0, next1, next2, next3);
         }
 
         while (n_left_from > 0 && n_left_to_next > 0)
diff --git a/src/vnet/dpo/mpls_label_dpo.h b/src/vnet/dpo/mpls_label_dpo.h
index 89bcb09..e23f3d2 100644
--- a/src/vnet/dpo/mpls_label_dpo.h
+++ b/src/vnet/dpo/mpls_label_dpo.h
@@ -61,8 +61,8 @@
  * Should this get any bigger then we will need to reconsider how many labels
  * can be pushed in one object.
  */
-_Static_assert((sizeof(mpls_label_dpo_t) <= CLIB_CACHE_LINE_BYTES),
-	       "MPLS label DPO is larger than one cache line.");
+STATIC_ASSERT((sizeof(mpls_label_dpo_t) <= CLIB_CACHE_LINE_BYTES),
+              "MPLS label DPO is larger than one cache line.");
 
 /**
  * @brief Create an MPLS label object
diff --git a/src/vnet/mpls/error.def b/src/vnet/mpls/error.def
index de8b966..34a4652 100644
--- a/src/vnet/mpls/error.def
+++ b/src/vnet/mpls/error.def
@@ -18,11 +18,11 @@
 mpls_error (NONE, "no error")
 mpls_error (UNKNOWN_PROTOCOL, "unknown protocol")
 mpls_error (UNSUPPORTED_VERSION, "unsupported version")
-mpls_error (PKTS_DECAP, "MPLS-GRE input packets decapsulated")
-mpls_error (PKTS_ENCAP, "MPLS-GRE output packets encapsulated")
-mpls_error (NO_LABEL, "MPLS-GRE no label for fib/dst")
-mpls_error (TTL_EXPIRED, "MPLS-GRE ttl expired")
-mpls_error (S_NOT_SET, "MPLS-GRE s-bit not set")
+mpls_error (PKTS_DECAP, "MPLS input packets decapsulated")
+mpls_error (PKTS_ENCAP, "MPLS output packets encapsulated")
+mpls_error (NO_LABEL, "MPLS no label for fib/dst")
+mpls_error (TTL_EXPIRED, "MPLS ttl expired")
+mpls_error (S_NOT_SET, "MPLS s-bit not set")
 mpls_error (BAD_LABEL, "invalid FIB id in label")
 mpls_error (NOT_IP4, "non-ip4 packets dropped")
 mpls_error (DISALLOWED_FIB, "disallowed FIB id")
diff --git a/src/vnet/mpls/mpls.c b/src/vnet/mpls/mpls.c
index 7ae4aa0..482577b 100644
--- a/src/vnet/mpls/mpls.c
+++ b/src/vnet/mpls/mpls.c
@@ -161,6 +161,14 @@
                  &h_host);
 }
 
+typedef struct {
+  u32 fib_index;
+  u32 entry_index;
+  u32 dest;
+  u32 s_bit;
+  u32 label;
+} show_mpls_fib_t;
+
 int
 mpls_dest_cmp(void * a1, void * a2)
 {
diff --git a/src/vnet/mpls/mpls.h b/src/vnet/mpls/mpls.h
index b6fdbce..300f2cf 100644
--- a/src/vnet/mpls/mpls.h
+++ b/src/vnet/mpls/mpls.h
@@ -86,16 +86,12 @@
 
 extern clib_error_t * mpls_feature_init(vlib_main_t * vm);
 
-format_function_t format_mpls_protocol;
-format_function_t format_mpls_encap_index;
-
 format_function_t format_mpls_eos_bit;
 format_function_t format_mpls_unicast_header_net_byte_order;
 format_function_t format_mpls_unicast_label;
 format_function_t format_mpls_header;
 
 extern vlib_node_registration_t mpls_input_node;
-extern vlib_node_registration_t mpls_policy_encap_node;
 extern vlib_node_registration_t mpls_output_node;
 extern vlib_node_registration_t mpls_midchain_node;
 
@@ -118,48 +114,6 @@
 
 int mpls_fib_reset_labels (u32 fib_id);
 
-#define foreach_mpls_input_next			\
-_(DROP, "error-drop")                           \
-_(LOOKUP, "mpls-lookup")
-
-typedef enum {
-#define _(s,n) MPLS_INPUT_NEXT_##s,
-  foreach_mpls_input_next
-#undef _
-  MPLS_INPUT_N_NEXT,
-} mpls_input_next_t;
-
-#define foreach_mpls_lookup_next        	\
-_(DROP, "error-drop")                           \
-_(IP4_INPUT, "ip4-input")                       \
-_(L2_OUTPUT, "l2-output")
-
-// FIXME remove.
-typedef enum {
-#define _(s,n) MPLS_LOOKUP_NEXT_##s,
-  foreach_mpls_lookup_next
-#undef _
-  MPLS_LOOKUP_N_NEXT,
-} mpls_lookup_next_t;
-
-#define foreach_mpls_output_next        	\
-_(DROP, "error-drop")
-
-typedef enum {
-#define _(s,n) MPLS_OUTPUT_NEXT_##s,
-  foreach_mpls_output_next
-#undef _
-  MPLS_OUTPUT_N_NEXT,
-} mpls_output_next_t;
-
-typedef struct {
-  u32 fib_index;
-  u32 entry_index;
-  u32 dest;
-  u32 s_bit;
-  u32 label;
-} show_mpls_fib_t;
-
 int
 mpls_dest_cmp(void * a1, void * a2);
 
diff --git a/src/vnet/mpls/node.c b/src/vnet/mpls/mpls_input.c
similarity index 82%
rename from src/vnet/mpls/node.c
rename to src/vnet/mpls/mpls_input.c
index 5b407fa..893c451 100644
--- a/src/vnet/mpls/node.c
+++ b/src/vnet/mpls/mpls_input.c
@@ -22,9 +22,20 @@
 
 typedef struct {
   u32 next_index;
-  u32 label_host_byte_order;
+  u32 label_net_byte_order;
 } mpls_input_trace_t;
 
+#define foreach_mpls_input_next			\
+_(DROP, "error-drop")                           \
+_(LOOKUP, "mpls-lookup")
+
+typedef enum {
+#define _(s,n) MPLS_INPUT_NEXT_##s,
+  foreach_mpls_input_next
+#undef _
+  MPLS_INPUT_N_NEXT,
+} mpls_input_next_t;
+
 static u8 *
 format_mpls_input_trace (u8 * s, va_list * args)
 {
@@ -32,8 +43,9 @@
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   mpls_input_trace_t * t = va_arg (*args, mpls_input_trace_t *);
   char * next_name;
-
+  u32 label;
   next_name = "BUG!";
+  label = clib_net_to_host_u32(t->label_net_byte_order);
 
 #define _(a,b) if (t->next_index == MPLS_INPUT_NEXT_##a) next_name = b;
   foreach_mpls_input_next;
@@ -41,8 +53,8 @@
   
   s = format (s, "MPLS: next %s[%d]  label %d ttl %d", 
               next_name, t->next_index,
-	      vnet_mpls_uc_get_label(t->label_host_byte_order),
-	      vnet_mpls_uc_get_ttl(t->label_host_byte_order));
+	      vnet_mpls_uc_get_label(label),
+	      vnet_mpls_uc_get_ttl(label));
 
   return s;
 }
@@ -88,30 +100,29 @@
       u32 n_left_to_next;
 
       vlib_get_next_frame (vm, node, next_index,
-			   to_next, n_left_to_next);
+                           to_next, n_left_to_next);
 
       while (n_left_from >= 4 && n_left_to_next >= 2)
         {
-          u32 label0, bi0, next0, sw_if_index0;
-          u32 label1, bi1, next1, sw_if_index1;
-          mpls_unicast_header_t *h0, *h1;
+          u32 bi0, next0, sw_if_index0;
+          u32 bi1, next1, sw_if_index1;
           vlib_buffer_t *b0, *b1;
+          char *h0, *h1;
 
           /* Prefetch next iteration. */
           {
-            vlib_buffer_t * p2, * p3;
+              vlib_buffer_t * p2, * p3;
 
-            p2 = vlib_get_buffer (vm, from[2]);
-            p3 = vlib_get_buffer (vm, from[3]);
+              p2 = vlib_get_buffer (vm, from[2]);
+              p3 = vlib_get_buffer (vm, from[3]);
 
-            vlib_prefetch_buffer_header (p2, STORE);
-            vlib_prefetch_buffer_header (p3, STORE);
+              vlib_prefetch_buffer_header (p2, STORE);
+              vlib_prefetch_buffer_header (p3, STORE);
 
-            CLIB_PREFETCH (p2->data, sizeof (h0[0]), STORE);
-            CLIB_PREFETCH (p3->data, sizeof (h1[0]), STORE);
+              CLIB_PREFETCH (p2->data, sizeof (h0[0]), STORE);
+              CLIB_PREFETCH (p3->data, sizeof (h1[0]), STORE);
           }
 
-
           bi0 = to_next[0] = from[0];
           bi1 = to_next[1] = from[1];
 
@@ -129,62 +140,59 @@
           sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
           sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
 
-          label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
-          label1 = clib_net_to_host_u32 (h1->label_exp_s_ttl);
-
           /* TTL expired? */
-          if (PREDICT_FALSE(vnet_mpls_uc_get_ttl (label0) == 0))
-           {
+          if (PREDICT_FALSE(h0[3] == 0))
+          {
               next0 = MPLS_INPUT_NEXT_DROP;
               b0->error = node->errors[MPLS_ERROR_TTL_EXPIRED];
-            }
+          }
           else
-            {
+          {
               next0 = MPLS_INPUT_NEXT_LOOKUP;
-              vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0);
+              vnet_feature_arc_start(mm->input_feature_arc_index,
+                                     sw_if_index0, &next0, b0);
               vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
-            }
+          }
 
-          if (PREDICT_FALSE(vnet_mpls_uc_get_ttl (label1) == 0))
-           {
+          if (PREDICT_FALSE(h1[3] == 0))
+          {
               next1 = MPLS_INPUT_NEXT_DROP;
               b1->error = node->errors[MPLS_ERROR_TTL_EXPIRED];
-            }
+          }
           else
-            {
+          {
               next1 = MPLS_INPUT_NEXT_LOOKUP;
-              vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index1, &next1, b1);
+              vnet_feature_arc_start(mm->input_feature_arc_index,
+                                     sw_if_index1, &next1, b1);
               vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
-            }
+          }
 
           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
-            {
+          {
               mpls_input_trace_t *tr = vlib_add_trace (vm, node,
                                                        b0, sizeof (*tr));
               tr->next_index = next0;
-              tr->label_host_byte_order = label0;
-            }
+              tr->label_net_byte_order = *((u32*)h0);
+          }
           if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
-            {
+          {
               mpls_input_trace_t *tr = vlib_add_trace (vm, node,
                                                        b1, sizeof (*tr));
               tr->next_index = next1;
-              tr->label_host_byte_order = label1;
-            }
+              tr->label_net_byte_order = *((u32*)h1);
+          }
 
           vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
                                            to_next, n_left_to_next,
-                                           bi0, bi1, next0, next1);
+                                           bi0, bi1,
+                                           next0, next1);
         }
 
       while (n_left_from > 0 && n_left_to_next > 0)
 	{
-	  u32 bi0;
+          u32 sw_if_index0, next0, bi0;
 	  vlib_buffer_t * b0;
-	  mpls_unicast_header_t * h0;
-          u32 label0;
-	  u32 next0 = 0;
-          u32 sw_if_index0;
+	  char * h0;
 
 	  bi0 = from[0];
 	  to_next[0] = bi0;
@@ -197,9 +205,8 @@
           h0 = vlib_buffer_get_current (b0);
 	  sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
 
-	  label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
 	  /* TTL expired? */
-	  if (PREDICT_FALSE(vnet_mpls_uc_get_ttl (label0) == 0))
+	  if (PREDICT_FALSE(h0[3] == 0))
            {
               next0 = MPLS_INPUT_NEXT_DROP;
               b0->error = node->errors[MPLS_ERROR_TTL_EXPIRED];
@@ -216,7 +223,7 @@
               mpls_input_trace_t *tr = vlib_add_trace (vm, node, 
 						       b0, sizeof (*tr));
               tr->next_index = next0;
-              tr->label_host_byte_order = label0;
+              tr->label_net_byte_order = *(u32*)h0;
             }
 
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c
index 2d34cbd..475bb20 100644
--- a/src/vnet/mpls/mpls_lookup.c
+++ b/src/vnet/mpls/mpls_lookup.c
@@ -80,7 +80,7 @@
       vlib_get_next_frame (vm, node, next_index,
                            to_next, n_left_to_next);
 
-      while (n_left_from >= 4 && n_left_to_next >= 2)
+      while (n_left_from >= 8 && n_left_to_next >= 4)
         {
           u32 lbi0, next0, lfib_index0, bi0, hash_c0;
           const mpls_unicast_header_t * h0;
@@ -92,46 +92,79 @@
           const load_balance_t *lb1;
           const dpo_id_t *dpo1;
           vlib_buffer_t * b1;
+          u32 lbi2, next2, lfib_index2, bi2, hash_c2;
+          const mpls_unicast_header_t * h2;
+          const load_balance_t *lb2;
+          const dpo_id_t *dpo2;
+          vlib_buffer_t * b2;
+          u32 lbi3, next3, lfib_index3, bi3, hash_c3;
+          const mpls_unicast_header_t * h3;
+          const load_balance_t *lb3;
+          const dpo_id_t *dpo3;
+          vlib_buffer_t * b3;
 
            /* Prefetch next iteration. */
           {
-            vlib_buffer_t * p2, * p3;
+              vlib_buffer_t * p2, * p3, *p4, *p5;
 
             p2 = vlib_get_buffer (vm, from[2]);
             p3 = vlib_get_buffer (vm, from[3]);
+            p4 = vlib_get_buffer (vm, from[4]);
+            p5 = vlib_get_buffer (vm, from[5]);
 
             vlib_prefetch_buffer_header (p2, STORE);
             vlib_prefetch_buffer_header (p3, STORE);
+            vlib_prefetch_buffer_header (p4, STORE);
+            vlib_prefetch_buffer_header (p5, STORE);
 
             CLIB_PREFETCH (p2->data, sizeof (h0[0]), STORE);
             CLIB_PREFETCH (p3->data, sizeof (h0[0]), STORE);
+            CLIB_PREFETCH (p4->data, sizeof (h0[0]), STORE);
+            CLIB_PREFETCH (p5->data, sizeof (h0[0]), STORE);
           }
 
           bi0 = to_next[0] = from[0];
           bi1 = to_next[1] = from[1];
+          bi2 = to_next[2] = from[2];
+          bi3 = to_next[3] = from[3];
 
-          from += 2;
-          n_left_from -= 2;
-          to_next += 2;
-          n_left_to_next -= 2;
+          from += 4;
+          n_left_from -= 4;
+          to_next += 4;
+          n_left_to_next -= 4;
 
           b0 = vlib_get_buffer (vm, bi0);
           b1 = vlib_get_buffer (vm, bi1);
+          b2 = vlib_get_buffer (vm, bi2);
+          b3 = vlib_get_buffer (vm, bi3);
           h0 = vlib_buffer_get_current (b0);
           h1 = vlib_buffer_get_current (b1);
+          h2 = vlib_buffer_get_current (b2);
+          h3 = vlib_buffer_get_current (b3);
 
           lfib_index0 = vec_elt(mm->fib_index_by_sw_if_index,
                                 vnet_buffer(b0)->sw_if_index[VLIB_RX]);
           lfib_index1 = vec_elt(mm->fib_index_by_sw_if_index,
                                 vnet_buffer(b1)->sw_if_index[VLIB_RX]);
+          lfib_index2 = vec_elt(mm->fib_index_by_sw_if_index,
+                                vnet_buffer(b2)->sw_if_index[VLIB_RX]);
+          lfib_index3 = vec_elt(mm->fib_index_by_sw_if_index,
+                                vnet_buffer(b3)->sw_if_index[VLIB_RX]);
 
           lbi0 = mpls_fib_table_forwarding_lookup (lfib_index0, h0);
           lbi1 = mpls_fib_table_forwarding_lookup (lfib_index1, h1);
+          lbi2 = mpls_fib_table_forwarding_lookup (lfib_index2, h2);
+          lbi3 = mpls_fib_table_forwarding_lookup (lfib_index3, h3);
+
           lb0 = load_balance_get(lbi0);
           lb1 = load_balance_get(lbi1);
+          lb2 = load_balance_get(lbi2);
+          lb3 = load_balance_get(lbi3);
 
           hash_c0 = vnet_buffer(b0)->ip.flow_hash = 0;
           hash_c1 = vnet_buffer(b1)->ip.flow_hash = 0;
+          hash_c2 = vnet_buffer(b2)->ip.flow_hash = 0;
+          hash_c3 = vnet_buffer(b3)->ip.flow_hash = 0;
 
           if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
           {
@@ -143,11 +176,25 @@
               hash_c1 = vnet_buffer (b1)->ip.flow_hash =
                   mpls_compute_flow_hash(h1, lb1->lb_hash_config);
           }
+          if (PREDICT_FALSE(lb2->lb_n_buckets > 1))
+          {
+              hash_c2 = vnet_buffer (b2)->ip.flow_hash =
+                  mpls_compute_flow_hash(h2, lb2->lb_hash_config);
+          }
+          if (PREDICT_FALSE(lb3->lb_n_buckets > 1))
+          {
+              hash_c3 = vnet_buffer (b3)->ip.flow_hash =
+                  mpls_compute_flow_hash(h3, lb3->lb_hash_config);
+          }
 
           ASSERT (lb0->lb_n_buckets > 0);
           ASSERT (is_pow2 (lb0->lb_n_buckets));
           ASSERT (lb1->lb_n_buckets > 0);
           ASSERT (is_pow2 (lb1->lb_n_buckets));
+          ASSERT (lb2->lb_n_buckets > 0);
+          ASSERT (is_pow2 (lb2->lb_n_buckets));
+          ASSERT (lb3->lb_n_buckets > 0);
+          ASSERT (is_pow2 (lb3->lb_n_buckets));
 
           dpo0 = load_balance_get_bucket_i(lb0,
                                            (hash_c0 &
@@ -155,12 +202,22 @@
           dpo1 = load_balance_get_bucket_i(lb1,
                                            (hash_c1 &
                                             (lb1->lb_n_buckets_minus_1)));
+          dpo2 = load_balance_get_bucket_i(lb2,
+                                           (hash_c2 &
+                                            (lb2->lb_n_buckets_minus_1)));
+          dpo3 = load_balance_get_bucket_i(lb3,
+                                           (hash_c3 &
+                                            (lb3->lb_n_buckets_minus_1)));
 
           next0 = dpo0->dpoi_next_node;
           next1 = dpo1->dpoi_next_node;
+          next2 = dpo2->dpoi_next_node;
+          next3 = dpo3->dpoi_next_node;
 
           vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
           vnet_buffer (b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
+          vnet_buffer (b2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;
+          vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
 
           vlib_increment_combined_counter
               (cm, cpu_index, lbi0, 1,
@@ -168,6 +225,12 @@
           vlib_increment_combined_counter
               (cm, cpu_index, lbi1, 1,
                vlib_buffer_length_in_chain (vm, b1));
+          vlib_increment_combined_counter
+              (cm, cpu_index, lbi2, 1,
+               vlib_buffer_length_in_chain (vm, b2));
+          vlib_increment_combined_counter
+              (cm, cpu_index, lbi3, 1,
+               vlib_buffer_length_in_chain (vm, b3));
 
           /*
            * before we pop the label copy th values we need to maintain.
@@ -181,12 +244,20 @@
           vnet_buffer (b1)->mpls.ttl = ((char*)h1)[3];
           vnet_buffer (b1)->mpls.exp = (((char*)h1)[2] & 0xe) >> 1;
           vnet_buffer (b1)->mpls.first = 1;
+          vnet_buffer (b2)->mpls.ttl = ((char*)h2)[3];
+          vnet_buffer (b2)->mpls.exp = (((char*)h2)[2] & 0xe) >> 1;
+          vnet_buffer (b2)->mpls.first = 1;
+          vnet_buffer (b3)->mpls.ttl = ((char*)h3)[3];
+          vnet_buffer (b3)->mpls.exp = (((char*)h3)[2] & 0xe) >> 1;
+          vnet_buffer (b3)->mpls.first = 1;
 
           /*
            * pop the label that was just used in the lookup
            */
           vlib_buffer_advance(b0, sizeof(*h0));
           vlib_buffer_advance(b1, sizeof(*h1));
+          vlib_buffer_advance(b2, sizeof(*h2));
+          vlib_buffer_advance(b3, sizeof(*h3));
 
           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
           {
@@ -210,9 +281,32 @@
               tr->label_net_byte_order = h1->label_exp_s_ttl;
           }
 
-          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+          if (PREDICT_FALSE(b2->flags & VLIB_BUFFER_IS_TRACED))
+          {
+              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
+                                                        b2, sizeof (*tr));
+              tr->next_index = next2;
+              tr->lb_index = lbi2;
+              tr->lfib_index = lfib_index2;
+              tr->hash = hash_c2;
+              tr->label_net_byte_order = h2->label_exp_s_ttl;
+          }
+
+          if (PREDICT_FALSE(b3->flags & VLIB_BUFFER_IS_TRACED))
+          {
+              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
+                                                        b3, sizeof (*tr));
+              tr->next_index = next3;
+              tr->lb_index = lbi3;
+              tr->lfib_index = lfib_index3;
+              tr->hash = hash_c3;
+              tr->label_net_byte_order = h3->label_exp_s_ttl;
+          }
+
+          vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
                                            to_next, n_left_to_next,
-                                           bi0, bi1, next0, next1);
+                                           bi0, bi1, bi2, bi3,
+                                           next0, next1, next2, next3);
         }
 
       while (n_left_from > 0 && n_left_to_next > 0)
@@ -361,10 +455,9 @@
 
       while (n_left_from >= 4 && n_left_to_next >= 2)
         {
-          mpls_lookup_next_t next0, next1;
           const load_balance_t *lb0, *lb1;
           vlib_buffer_t * p0, *p1;
-          u32 pi0, lbi0, hc0, pi1, lbi1, hc1;
+          u32 pi0, lbi0, hc0, pi1, lbi1, hc1, next0, next1;
           const mpls_unicast_header_t *mpls0, *mpls1;
           const dpo_id_t *dpo0, *dpo1;
 
@@ -465,10 +558,9 @@
 
       while (n_left_from > 0 && n_left_to_next > 0)
         {
-          mpls_lookup_next_t next0;
           const load_balance_t *lb0;
           vlib_buffer_t * p0;
-          u32 pi0, lbi0, hc0;
+          u32 pi0, lbi0, hc0, next0;
           const mpls_unicast_header_t *mpls0;
           const dpo_id_t *dpo0;
 
diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c
index cf35400..2d8bd0c 100644
--- a/src/vnet/mpls/mpls_output.c
+++ b/src/vnet/mpls/mpls_output.c
@@ -29,6 +29,16 @@
   u8 packet_data[64 - 1*sizeof(u32)];
 } mpls_output_trace_t;
 
+#define foreach_mpls_output_next        	\
+_(DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) MPLS_OUTPUT_NEXT_##s,
+  foreach_mpls_output_next
+#undef _
+  MPLS_OUTPUT_N_NEXT,
+} mpls_output_next_t;
+
 static u8 *
 format_mpls_output_trace (u8 * s, va_list * args)
 {