Revert "L2-input: use vlib_buffer_enqueue_to_next"

The patch did not improve single input link performance and
degraded performance with multiple input links.

This reverts commit b1232555e91b286feab5667b5a22f29aa8e96626.

Change-Id: Ib9336a2e0610088b9145a43cacbdadb52dabd522
Signed-off-by: John Lo <loj@cisco.com>
diff --git a/src/vnet/ethernet/packet.h b/src/vnet/ethernet/packet.h
index 04ce420..d70960b 100644
--- a/src/vnet/ethernet/packet.h
+++ b/src/vnet/ethernet/packet.h
@@ -64,20 +64,20 @@
 
 /* I/G bit: individual (unicast)/group (broadcast/multicast). */
 always_inline uword
-ethernet_address_cast (const u8 * a)
+ethernet_address_cast (u8 * a)
 {
   return (a[0] >> 0) & 1;
 }
 
 always_inline int
-ethernet_address_is_broadcast (const u8 * a)
+ethernet_address_is_broadcast (u8 * a)
 {
   return clib_mem_unaligned (a, u32) == 0xffffffff &&
     clib_mem_unaligned (a + 4, u16) == 0xffff;
 }
 
 always_inline uword
-ethernet_address_is_locally_administered (const u8 * a)
+ethernet_address_is_locally_administered (u8 * a)
 {
   return (a[0] >> 1) & 1;
 }
diff --git a/src/vnet/l2/l2_input.c b/src/vnet/l2/l2_input.c
index fbf4a6b..7e41c88 100644
--- a/src/vnet/l2/l2_input.c
+++ b/src/vnet/l2/l2_input.c
@@ -138,85 +138,9 @@
   L2INPUT_N_NEXT,
 } l2input_next_t;
 
-static_always_inline u32
-l2_input_classiy_unicast (vlib_buffer_t * b, const ethernet_header_t * h0)
-{
-  u32 feat_mask = ~0;
-  u8 *l3h0 = (u8 *) h0 + vnet_buffer (b)->l2.l2_len;
-
-#define get_u16(addr) ( *((u16 *)(addr)) )
-  u16 ethertype = clib_net_to_host_u16 (get_u16 (l3h0 - 2));
-  u8 protocol = ((ip6_header_t *) l3h0)->protocol;
-
-  /* Disable bridge forwarding (flooding will execute instead if not xconnect) */
-  feat_mask &= ~(L2INPUT_FEAT_FWD |
-		 L2INPUT_FEAT_UU_FLOOD | L2INPUT_FEAT_GBP_FWD);
-
-  /* Disable ARP-term for non-ARP and non-ICMP6 packet */
-  if (ethertype != ETHERNET_TYPE_ARP &&
-      (ethertype != ETHERNET_TYPE_IP6 || protocol != IP_PROTOCOL_ICMP6))
-    feat_mask &= ~(L2INPUT_FEAT_ARP_TERM);
-
-  /*
-   * For packet from BVI - set SHG of ARP request or ICMPv6 neighbor
-   * solicitation packet from BVI to 0 so it can also flood to VXLAN
-   * tunnels or other ports with the same SHG as that of the BVI.
-   */
-  else if (PREDICT_FALSE (vnet_buffer (b)->sw_if_index[VLIB_TX] ==
-			  L2INPUT_BVI))
-    {
-      if (ethertype == ETHERNET_TYPE_ARP)
-	{
-	  ethernet_arp_header_t *arp0 = (ethernet_arp_header_t *) l3h0;
-	  if (arp0->opcode ==
-	      clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_request))
-	    vnet_buffer (b)->l2.shg = 0;
-	}
-      else			/* must be ICMPv6 */
-	{
-	  ip6_header_t *iph0 = (ip6_header_t *) l3h0;
-	  icmp6_neighbor_solicitation_or_advertisement_header_t *ndh0;
-	  ndh0 = ip6_next_header (iph0);
-	  if (ndh0->icmp.type == ICMP6_neighbor_solicitation)
-	    vnet_buffer (b)->l2.shg = 0;
-	}
-    }
-
-  return (feat_mask);
-}
 
 static_always_inline void
-l2_input_classify_bridge (l2_bridge_domain_t * bd_config,
-			  u16 bd_index,
-			  u32 sw_if_index, vlib_buffer_t * b, u32 * feat_mask)
-{
-  /* save BD ID for next feature graph nodes */
-  vnet_buffer (b)->l2.bd_index = bd_index;
-
-  /* Save bridge domain and interface seq_num */
-  /* *INDENT-OFF* */
-  l2fib_seq_num_t sn = {
-    .swif = *l2fib_swif_seq_num(sw_if_index),
-    .bd = bd_config->seq_num,
-  };
-  /* *INDENT-ON* */
-  vnet_buffer (b)->l2.l2fib_sn = sn.as_u16;;
-  vnet_buffer (b)->l2.bd_age = bd_config->mac_age;
-
-  /*
-   * Process bridge domain feature enables.
-   * To perform learning/flooding/forwarding, the corresponding bit
-   * must be enabled in both the input interface config and in the
-   * bridge domain config. In the bd_bitmap, bits for features other
-   * than learning/flooding/forwarding should always be set.
-   */
-  *feat_mask = *feat_mask & bd_config->feature_bitmap;
-}
-
-static_always_inline void
-classify_and_dispatch (l2input_main_t * msm,
-		       u16 n_left,
-		       u32 sw_if_index, vlib_buffer_t ** b, u16 * next)
+classify_and_dispatch (l2input_main_t * msm, vlib_buffer_t * b0, u32 * next0)
 {
   /*
    * Load L2 input feature struct
@@ -232,202 +156,119 @@
    *   set tx sw-if-handle
    */
 
-  l2_input_config_t *config;
+  u32 feat_mask = ~0;
+  u32 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+  ethernet_header_t *h0 = vlib_buffer_get_current (b0);
 
   /* Get config for the input interface */
-  config = vec_elt_at_index (msm->configs, sw_if_index);
+  l2_input_config_t *config = vec_elt_at_index (msm->configs, sw_if_index0);
 
-  while (n_left > 8)
+  /* Save split horizon group */
+  vnet_buffer (b0)->l2.shg = config->shg;
+
+  /* determine layer2 kind for stat and mask */
+  if (PREDICT_FALSE (ethernet_address_cast (h0->dst_address)))
     {
-      const ethernet_header_t *h0, *h1, *h2, *h3;
-      u32 fm0, fm1, fm2, fm3;
-      u32 fb0, fb1, fb2, fb3;
+      u8 *l3h0 = (u8 *) h0 + vnet_buffer (b0)->l2.l2_len;
 
-      if (n_left >= 4)
-	{
-	  vlib_prefetch_buffer_header (b[0], LOAD);
-	  vlib_prefetch_buffer_header (b[1], LOAD);
-	  vlib_prefetch_buffer_header (b[2], LOAD);
-	  vlib_prefetch_buffer_header (b[3], LOAD);
-	}
+#define get_u16(addr) ( *((u16 *)(addr)) )
+      u16 ethertype = clib_net_to_host_u16 (get_u16 (l3h0 - 2));
+      u8 protocol = ((ip6_header_t *) l3h0)->protocol;
 
-      fm0 = fm1 = fm2 = fm3 = ~0;
-      h0 = vlib_buffer_get_current (b[0]);
-      h1 = vlib_buffer_get_current (b[1]);
-      h2 = vlib_buffer_get_current (b[2]);
-      h3 = vlib_buffer_get_current (b[3]);
+      /* Disable bridge forwarding (flooding will execute instead if not xconnect) */
+      feat_mask &= ~(L2INPUT_FEAT_FWD |
+		     L2INPUT_FEAT_UU_FLOOD | L2INPUT_FEAT_GBP_FWD);
 
-      /* Save split horizon group */
-      vnet_buffer (b[0])->l2.shg = config->shg;
-      vnet_buffer (b[1])->l2.shg = config->shg;
-      vnet_buffer (b[2])->l2.shg = config->shg;
-      vnet_buffer (b[3])->l2.shg = config->shg;
+      /* Disable ARP-term for non-ARP and non-ICMP6 packet */
+      if (ethertype != ETHERNET_TYPE_ARP &&
+	  (ethertype != ETHERNET_TYPE_IP6 || protocol != IP_PROTOCOL_ICMP6))
+	feat_mask &= ~(L2INPUT_FEAT_ARP_TERM);
 
-      /* determine layer2 kind for stat and mask */
-      if (PREDICT_FALSE (ethernet_address_cast (h0->dst_address)))
+      /*
+       * For packet from BVI - set SHG of ARP request or ICMPv6 neighbor
+       * solicitation packet from BVI to 0 so it can also flood to VXLAN
+       * tunnels or other ports with the same SHG as that of the BVI.
+       */
+      else if (PREDICT_FALSE (vnet_buffer (b0)->sw_if_index[VLIB_TX] ==
+			      L2INPUT_BVI))
 	{
-	  fm0 = l2_input_classiy_unicast (b[0], h0);
+	  if (ethertype == ETHERNET_TYPE_ARP)
+	    {
+	      ethernet_arp_header_t *arp0 = (ethernet_arp_header_t *) l3h0;
+	      if (arp0->opcode ==
+		  clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_request))
+		vnet_buffer (b0)->l2.shg = 0;
+	    }
+	  else			/* must be ICMPv6 */
+	    {
+	      ip6_header_t *iph0 = (ip6_header_t *) l3h0;
+	      icmp6_neighbor_solicitation_or_advertisement_header_t *ndh0;
+	      ndh0 = ip6_next_header (iph0);
+	      if (ndh0->icmp.type == ICMP6_neighbor_solicitation)
+		vnet_buffer (b0)->l2.shg = 0;
+	    }
 	}
-      else
-	{
-	  if (PREDICT_FALSE (sw_if_index == L2INPUT_BVI))
-	    vnet_buffer (b[0])->l2.shg = 0;
-	}
-      if (PREDICT_FALSE (ethernet_address_cast (h1->dst_address)))
-	{
-	  fm1 = l2_input_classiy_unicast (b[1], h1);
-	}
-      else
-	{
-	  if (PREDICT_FALSE (sw_if_index == L2INPUT_BVI))
-	    vnet_buffer (b[1])->l2.shg = 0;
-	}
-      if (PREDICT_FALSE (ethernet_address_cast (h2->dst_address)))
-	{
-	  fm2 = l2_input_classiy_unicast (b[2], h2);
-	}
-      else
-	{
-	  if (PREDICT_FALSE (sw_if_index == L2INPUT_BVI))
-	    vnet_buffer (b[1])->l2.shg = 0;
-	}
-      if (PREDICT_FALSE (ethernet_address_cast (h3->dst_address)))
-	{
-	  fm3 = l2_input_classiy_unicast (b[3], h3);
-	}
-      else
-	{
-	  if (PREDICT_FALSE (sw_if_index == L2INPUT_BVI))
-	    vnet_buffer (b[1])->l2.shg = 0;
-	}
-
-      if (config->bridge)
-	{
-	  /* Do bridge-domain processing */
-	  l2_bridge_domain_t *bd_config;
-	  u16 bd_index;
-
-	  bd_index = config->bd_index;
-	  bd_config = vec_elt_at_index (msm->bd_configs, bd_index);
-
-	  l2_input_classify_bridge (bd_config, bd_index, sw_if_index,
-				    b[0], &fm0);
-	  l2_input_classify_bridge (bd_config, bd_index, sw_if_index,
-				    b[1], &fm1);
-	  l2_input_classify_bridge (bd_config, bd_index, sw_if_index,
-				    b[2], &fm2);
-	  l2_input_classify_bridge (bd_config, bd_index, sw_if_index,
-				    b[3], &fm3);
-	}
-      else if (config->xconnect)
-	{
-	  /* Set the output interface */
-	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
-	    config->output_sw_if_index;
-	  vnet_buffer (b[1])->sw_if_index[VLIB_TX] =
-	    config->output_sw_if_index;
-	  vnet_buffer (b[2])->sw_if_index[VLIB_TX] =
-	    config->output_sw_if_index;
-	  vnet_buffer (b[3])->sw_if_index[VLIB_TX] =
-	    config->output_sw_if_index;
-	}
-      else
-	{
-	  fm0 = L2INPUT_FEAT_DROP;
-	  fm1 = L2INPUT_FEAT_DROP;
-	  fm2 = L2INPUT_FEAT_DROP;
-	  fm3 = L2INPUT_FEAT_DROP;
-	}
-
-      /* mask out features from bitmap using packet type and bd config */
-      fb0 = config->feature_bitmap & fm0;
-      fb1 = config->feature_bitmap & fm1;
-      fb2 = config->feature_bitmap & fm2;
-      fb3 = config->feature_bitmap & fm3;
-
-      /* save for next feature graph nodes */
-      vnet_buffer (b[0])->l2.feature_bitmap = fb0;
-      vnet_buffer (b[1])->l2.feature_bitmap = fb1;
-      vnet_buffer (b[2])->l2.feature_bitmap = fb2;
-      vnet_buffer (b[3])->l2.feature_bitmap = fb3;
-
-      /* Determine the next node */
-      *next = feat_bitmap_get_next_node_index (msm->feat_next_node_index,
-					       fb0);
-      next++;
-      *next = feat_bitmap_get_next_node_index (msm->feat_next_node_index,
-					       fb1);
-      next++;
-      *next = feat_bitmap_get_next_node_index (msm->feat_next_node_index,
-					       fb2);
-      next++;
-      *next = feat_bitmap_get_next_node_index (msm->feat_next_node_index,
-					       fb3);
-      next++;
-
-      b += 4;
-      n_left -= 4;
     }
-  while (n_left)
+  else
     {
-      const ethernet_header_t *h0;
-      u32 fm0, fb0;
-
-      fm0 = ~0;
-      h0 = vlib_buffer_get_current (b[0]);
-
-      /* Save split horizon group */
-      vnet_buffer (b[0])->l2.shg = config->shg;
-
-      /* determine layer2 kind for stat and mask */
-      if (PREDICT_FALSE (ethernet_address_cast (h0->dst_address)))
-	{
-	  fm0 = l2_input_classiy_unicast (b[0], h0);
-	}
-      else
-	{
-	  /*
-	   * For packet from BVI - set SHG of unicast packet from BVI to 0 so it
-	   * is not dropped on output to VXLAN tunnels or other ports with the
-	   * same SHG as that of the BVI.
-	   */
-	  if (PREDICT_FALSE (sw_if_index == L2INPUT_BVI))
-	    vnet_buffer (b[0])->l2.shg = 0;
-	}
-
-      if (config->bridge)
-	{
-	  /* Do bridge-domain processing */
-	  u16 bd_index = config->bd_index;
-	  l2_bridge_domain_t *bd_config =
-	    vec_elt_at_index (msm->bd_configs, bd_index);
-
-	  l2_input_classify_bridge (bd_config, bd_index, sw_if_index,
-				    b[0], &fm0);
-	}
-      else if (config->xconnect)
-	{
-	  /* Set the output interface */
-	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
-	    config->output_sw_if_index;
-	}
-      else
-	fm0 = L2INPUT_FEAT_DROP;
-
-      /* mask out features from bitmap using packet type and bd config */
-      fb0 = config->feature_bitmap & fm0;
-
-      /* save for next feature graph nodes */
-      vnet_buffer (b[0])->l2.feature_bitmap = fb0;
-
-      /* Determine the next node */
-      *next = feat_bitmap_get_next_node_index (msm->feat_next_node_index,
-					       fb0);
-
-      next += 1;
-      b += 1;
-      n_left -= 1;
+      /*
+       * For packet from BVI - set SHG of unicast packet from BVI to 0 so it
+       * is not dropped on output to VXLAN tunnels or other ports with the
+       * same SHG as that of the BVI.
+       */
+      if (PREDICT_FALSE (vnet_buffer (b0)->sw_if_index[VLIB_TX] ==
+			 L2INPUT_BVI))
+	vnet_buffer (b0)->l2.shg = 0;
     }
+
+
+  if (config->bridge)
+    {
+      /* Do bridge-domain processing */
+      u16 bd_index0 = config->bd_index;
+      /* save BD ID for next feature graph nodes */
+      vnet_buffer (b0)->l2.bd_index = bd_index0;
+
+      /* Get config for the bridge domain interface */
+      l2_bridge_domain_t *bd_config =
+	vec_elt_at_index (msm->bd_configs, bd_index0);
+
+      /* Save bridge domain and interface seq_num */
+      /* *INDENT-OFF* */
+      l2fib_seq_num_t sn = {
+        .swif = *l2fib_swif_seq_num(sw_if_index0),
+	.bd = bd_config->seq_num,
+      };
+      /* *INDENT-ON* */
+      vnet_buffer (b0)->l2.l2fib_sn = sn.as_u16;;
+      vnet_buffer (b0)->l2.bd_age = bd_config->mac_age;
+
+      /*
+       * Process bridge domain feature enables.
+       * To perform learning/flooding/forwarding, the corresponding bit
+       * must be enabled in both the input interface config and in the
+       * bridge domain config. In the bd_bitmap, bits for features other
+       * than learning/flooding/forwarding should always be set.
+       */
+      feat_mask = feat_mask & bd_config->feature_bitmap;
+    }
+  else if (config->xconnect)
+    {
+      /* Set the output interface */
+      vnet_buffer (b0)->sw_if_index[VLIB_TX] = config->output_sw_if_index;
+    }
+  else
+    feat_mask = L2INPUT_FEAT_DROP;
+
+  /* mask out features from bitmap using packet type and bd config */
+  u32 feature_bitmap = config->feature_bitmap & feat_mask;
+
+  /* save for next feature graph nodes */
+  vnet_buffer (b0)->l2.feature_bitmap = feature_bitmap;
+
+  /* Determine the next node */
+  *next0 = feat_bitmap_get_next_node_index (msm->feat_next_node_index,
+					    feature_bitmap);
 }
 
 static_always_inline uword
@@ -435,88 +276,169 @@
 		     vlib_node_runtime_t * node, vlib_frame_t * frame,
 		     int do_trace)
 {
-  u32 n_left, *from;
+  u32 n_left_from, *from, *to_next;
+  l2input_next_t next_index;
   l2input_main_t *msm = &l2input_main;
-  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
-  u16 nexts[VLIB_FRAME_SIZE];
-  u32 sw_if_indices[VLIB_FRAME_SIZE], *sw_if_index;
 
   from = vlib_frame_vector_args (frame);
-  n_left = frame->n_vectors;	/* number of packets to process */
-  vlib_get_buffers (vm, from, bufs, n_left);
-  b = bufs;
-  sw_if_index = sw_if_indices;
+  n_left_from = frame->n_vectors;	/* number of packets to process */
+  next_index = node->cached_next_index;
 
-  /* extract data from buffer metadata */
-  while (n_left >= 8)
+  while (n_left_from > 0)
     {
-      /* Prefetch the buffer header for the N+2 loop iteration */
-      vlib_prefetch_buffer_header (b[4], LOAD);
-      vlib_prefetch_buffer_header (b[5], LOAD);
-      vlib_prefetch_buffer_header (b[6], LOAD);
-      vlib_prefetch_buffer_header (b[7], LOAD);
+      u32 n_left_to_next;
 
-      sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_RX];
-      sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_RX];
-      sw_if_index[2] = vnet_buffer (b[2])->sw_if_index[VLIB_RX];
-      sw_if_index[3] = vnet_buffer (b[3])->sw_if_index[VLIB_RX];
+      /* get space to enqueue frame to graph node "next_index" */
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 
-      /* next */
-      sw_if_index += 4;
-      n_left -= 4;
-      b += 4;
-    }
-  while (n_left)
-    {
-      sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_RX];
-
-      /* next */
-      sw_if_index += 1;
-      n_left -= 1;
-      b += 1;
-    }
-
-  n_left = frame->n_vectors;
-
-  while (n_left)
-    {
-      u16 count, *next;
-      u16 off = frame->n_vectors - n_left;
-      b = bufs + off;
-
-      sw_if_index = sw_if_indices + off;
-      next = nexts + off;
-
-      count = clib_count_equal_u32 (sw_if_index, n_left);
-      n_left -= count;
-
-      classify_and_dispatch (msm, count, sw_if_index[0], b, next);
-    }
-
-  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
-    {
-      n_left = frame->n_vectors;	/* number of packets to process */
-      b = bufs;
-
-      while (n_left)
+      while (n_left_from >= 8 && n_left_to_next >= 4)
 	{
-	  if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_IS_TRACED))
-	    {
-	      ethernet_header_t *h0 = vlib_buffer_get_current (b[0]);
-	      l2input_trace_t *t =
-		vlib_add_trace (vm, node, b[0], sizeof (*t));
+	  u32 bi0, bi1, bi2, bi3;
+	  vlib_buffer_t *b0, *b1, *b2, *b3;
+	  u32 next0, next1, next2, next3;
+	  u32 sw_if_index0, sw_if_index1, sw_if_index2, sw_if_index3;
 
-	      t->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_RX];
+	  /* Prefetch next iteration. */
+	  {
+	    vlib_buffer_t *p4, *p5, *p6, *p7;
+
+	    p4 = vlib_get_buffer (vm, from[4]);
+	    p5 = vlib_get_buffer (vm, from[5]);
+	    p6 = vlib_get_buffer (vm, from[6]);
+	    p7 = vlib_get_buffer (vm, from[7]);
+
+	    /* Prefetch the buffer header and packet for the N+2 loop iteration */
+	    vlib_prefetch_buffer_header (p4, LOAD);
+	    vlib_prefetch_buffer_header (p5, LOAD);
+	    vlib_prefetch_buffer_header (p6, LOAD);
+	    vlib_prefetch_buffer_header (p7, LOAD);
+
+	    CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE);
+	    CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE);
+	    CLIB_PREFETCH (p6->data, CLIB_CACHE_LINE_BYTES, STORE);
+	    CLIB_PREFETCH (p7->data, CLIB_CACHE_LINE_BYTES, STORE);
+
+	    /*
+	     * Don't bother prefetching the bridge-domain config (which
+	     * depends on the input config above). Only a small number of
+	     * bridge domains are expected. Plus the structure is small
+	     * and several fit in a cache line.
+	     */
+	  }
+
+	  /* speculatively enqueue b0 and b1 to the current next frame */
+	  /* bi is "buffer index", b is pointer to the buffer */
+	  to_next[0] = bi0 = from[0];
+	  to_next[1] = bi1 = from[1];
+	  to_next[2] = bi2 = from[2];
+	  to_next[3] = bi3 = from[3];
+	  from += 4;
+	  to_next += 4;
+	  n_left_from -= 4;
+	  n_left_to_next -= 4;
+
+	  b0 = vlib_get_buffer (vm, bi0);
+	  b1 = vlib_get_buffer (vm, bi1);
+	  b2 = vlib_get_buffer (vm, bi2);
+	  b3 = vlib_get_buffer (vm, bi3);
+
+	  if (do_trace)
+	    {
+	      /* RX interface handles */
+	      sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+	      sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+	      sw_if_index2 = vnet_buffer (b2)->sw_if_index[VLIB_RX];
+	      sw_if_index3 = vnet_buffer (b3)->sw_if_index[VLIB_RX];
+
+	      if (b0->flags & VLIB_BUFFER_IS_TRACED)
+		{
+		  ethernet_header_t *h0 = vlib_buffer_get_current (b0);
+		  l2input_trace_t *t =
+		    vlib_add_trace (vm, node, b0, sizeof (*t));
+		  t->sw_if_index = sw_if_index0;
+		  clib_memcpy (t->src, h0->src_address, 6);
+		  clib_memcpy (t->dst, h0->dst_address, 6);
+		}
+	      if (b1->flags & VLIB_BUFFER_IS_TRACED)
+		{
+		  ethernet_header_t *h1 = vlib_buffer_get_current (b1);
+		  l2input_trace_t *t =
+		    vlib_add_trace (vm, node, b1, sizeof (*t));
+		  t->sw_if_index = sw_if_index1;
+		  clib_memcpy (t->src, h1->src_address, 6);
+		  clib_memcpy (t->dst, h1->dst_address, 6);
+		}
+	      if (b2->flags & VLIB_BUFFER_IS_TRACED)
+		{
+		  ethernet_header_t *h2 = vlib_buffer_get_current (b2);
+		  l2input_trace_t *t =
+		    vlib_add_trace (vm, node, b2, sizeof (*t));
+		  t->sw_if_index = sw_if_index2;
+		  clib_memcpy (t->src, h2->src_address, 6);
+		  clib_memcpy (t->dst, h2->dst_address, 6);
+		}
+	      if (b3->flags & VLIB_BUFFER_IS_TRACED)
+		{
+		  ethernet_header_t *h3 = vlib_buffer_get_current (b3);
+		  l2input_trace_t *t =
+		    vlib_add_trace (vm, node, b3, sizeof (*t));
+		  t->sw_if_index = sw_if_index3;
+		  clib_memcpy (t->src, h3->src_address, 6);
+		  clib_memcpy (t->dst, h3->dst_address, 6);
+		}
+	    }
+
+	  classify_and_dispatch (msm, b0, &next0);
+	  classify_and_dispatch (msm, b1, &next1);
+	  classify_and_dispatch (msm, b2, &next2);
+	  classify_and_dispatch (msm, b3, &next3);
+
+	  /* verify speculative enqueues, maybe switch current next frame */
+	  /* if next0==next1==next_index then nothing special needs to be done */
+	  vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+					   to_next, n_left_to_next,
+					   bi0, bi1, bi2, bi3,
+					   next0, next1, next2, next3);
+	}
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+	{
+	  u32 bi0;
+	  vlib_buffer_t *b0;
+	  u32 next0;
+	  u32 sw_if_index0;
+
+	  /* speculatively enqueue b0 to the current next frame */
+	  bi0 = from[0];
+	  to_next[0] = bi0;
+	  from += 1;
+	  to_next += 1;
+	  n_left_from -= 1;
+	  n_left_to_next -= 1;
+
+	  b0 = vlib_get_buffer (vm, bi0);
+
+	  if (do_trace && PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+	    {
+	      ethernet_header_t *h0 = vlib_buffer_get_current (b0);
+	      l2input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+	      sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+	      t->sw_if_index = sw_if_index0;
 	      clib_memcpy (t->src, h0->src_address, 6);
 	      clib_memcpy (t->dst, h0->dst_address, 6);
 	    }
-	  /* next */
-	  n_left--;
-	  b++;
+
+	  classify_and_dispatch (msm, b0, &next0);
+
+	  /* verify speculative enqueue, maybe switch current next frame */
+	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+					   to_next, n_left_to_next,
+					   bi0, next0);
 	}
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
 
-  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
   vlib_node_increment_counter (vm, l2input_node.index,
 			       L2INPUT_ERROR_L2INPUT, frame->n_vectors);