ip: ip4 lookup performance bump with the usual recipe

2.77e1 v. 2.81e1

Type: performance

Change-Id: I896ec77818603f17aaa622073dafc626570326f1
Signed-off-by: Neale Ranns <nranns@cisco.com>
diff --git a/src/vnet/ip/ip4_forward.h b/src/vnet/ip/ip4_forward.h
index b4492ce..4d52ad0 100644
--- a/src/vnet/ip/ip4_forward.h
+++ b/src/vnet/ip/ip4_forward.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Copyright (c) 2015-2019 Cisco and/or its affiliates.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
@@ -59,458 +59,401 @@
 {
   ip4_main_t *im = &ip4_main;
   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters;
-  u32 n_left_from, n_left_to_next, *from, *to_next;
-  ip_lookup_next_t next;
+  u32 n_left, *from;
   u32 thread_index = vm->thread_index;
   vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
   vlib_buffer_t **b = bufs;
+  u16 nexts[VLIB_FRAME_SIZE], *next;
 
   from = vlib_frame_vector_args (frame);
-  n_left_from = frame->n_vectors;
-  next = node->cached_next_index;
-  vlib_get_buffers (vm, from, bufs, n_left_from);
-
-  while (n_left_from > 0)
-    {
-      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+  n_left = frame->n_vectors;
+  next = nexts;
+  vlib_get_buffers (vm, from, bufs, n_left);
 
 #if (CLIB_N_PREFETCHES >= 8)
-      while (n_left_from >= 8 && n_left_to_next >= 4)
+  while (n_left >= 4)
+    {
+      ip4_header_t *ip0, *ip1, *ip2, *ip3;
+      const load_balance_t *lb0, *lb1, *lb2, *lb3;
+      ip4_fib_mtrie_t *mtrie0, *mtrie1, *mtrie2, *mtrie3;
+      ip4_fib_mtrie_leaf_t leaf0, leaf1, leaf2, leaf3;
+      ip4_address_t *dst_addr0, *dst_addr1, *dst_addr2, *dst_addr3;
+      u32 lb_index0, lb_index1, lb_index2, lb_index3;
+      flow_hash_config_t flow_hash_config0, flow_hash_config1;
+      flow_hash_config_t flow_hash_config2, flow_hash_config3;
+      u32 hash_c0, hash_c1, hash_c2, hash_c3;
+      const dpo_id_t *dpo0, *dpo1, *dpo2, *dpo3;
+
+      /* Prefetch next iteration. */
+      if (n_left >= 8)
 	{
-	  vlib_buffer_t *p0, *p1, *p2, *p3;
-	  ip4_header_t *ip0, *ip1, *ip2, *ip3;
-	  ip_lookup_next_t next0, next1, next2, next3;
-	  const load_balance_t *lb0, *lb1, *lb2, *lb3;
-	  ip4_fib_mtrie_t *mtrie0, *mtrie1, *mtrie2, *mtrie3;
-	  ip4_fib_mtrie_leaf_t leaf0, leaf1, leaf2, leaf3;
-	  ip4_address_t *dst_addr0, *dst_addr1, *dst_addr2, *dst_addr3;
-	  u32 pi0, pi1, pi2, pi3, lb_index0, lb_index1, lb_index2, lb_index3;
-	  flow_hash_config_t flow_hash_config0, flow_hash_config1;
-	  flow_hash_config_t flow_hash_config2, flow_hash_config3;
-	  u32 hash_c0, hash_c1, hash_c2, hash_c3;
-	  const dpo_id_t *dpo0, *dpo1, *dpo2, *dpo3;
+	  vlib_prefetch_buffer_header (b[4], LOAD);
+	  vlib_prefetch_buffer_header (b[5], LOAD);
+	  vlib_prefetch_buffer_header (b[6], LOAD);
+	  vlib_prefetch_buffer_header (b[7], LOAD);
 
-	  /* Prefetch next iteration. */
-	  {
-	    vlib_prefetch_buffer_header (b[4], LOAD);
-	    vlib_prefetch_buffer_header (b[5], LOAD);
-	    vlib_prefetch_buffer_header (b[6], LOAD);
-	    vlib_prefetch_buffer_header (b[7], LOAD);
-
-	    CLIB_PREFETCH (b[4]->data, sizeof (ip0[0]), LOAD);
-	    CLIB_PREFETCH (b[5]->data, sizeof (ip0[0]), LOAD);
-	    CLIB_PREFETCH (b[6]->data, sizeof (ip0[0]), LOAD);
-	    CLIB_PREFETCH (b[7]->data, sizeof (ip0[0]), LOAD);
-	  }
-
-	  pi0 = to_next[0] = from[0];
-	  pi1 = to_next[1] = from[1];
-	  pi2 = to_next[2] = from[2];
-	  pi3 = to_next[3] = from[3];
-
-	  from += 4;
-	  to_next += 4;
-	  n_left_to_next -= 4;
-	  n_left_from -= 4;
-
-	  p0 = b[0];
-	  p1 = b[1];
-	  p2 = b[2];
-	  p3 = b[3];
-	  b += 4;
-
-	  ip0 = vlib_buffer_get_current (p0);
-	  ip1 = vlib_buffer_get_current (p1);
-	  ip2 = vlib_buffer_get_current (p2);
-	  ip3 = vlib_buffer_get_current (p3);
-
-	  dst_addr0 = &ip0->dst_address;
-	  dst_addr1 = &ip1->dst_address;
-	  dst_addr2 = &ip2->dst_address;
-	  dst_addr3 = &ip3->dst_address;
-
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p0);
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p1);
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p2);
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p3);
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      mtrie0 = &ip4_fib_get (vnet_buffer (p0)->ip.fib_index)->mtrie;
-	      mtrie1 = &ip4_fib_get (vnet_buffer (p1)->ip.fib_index)->mtrie;
-	      mtrie2 = &ip4_fib_get (vnet_buffer (p2)->ip.fib_index)->mtrie;
-	      mtrie3 = &ip4_fib_get (vnet_buffer (p3)->ip.fib_index)->mtrie;
-
-	      leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
-	      leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1);
-	      leaf2 = ip4_fib_mtrie_lookup_step_one (mtrie2, dst_addr2);
-	      leaf3 = ip4_fib_mtrie_lookup_step_one (mtrie3, dst_addr3);
-	    }
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
-	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
-	      leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 2);
-	      leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 2);
-	    }
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
-	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
-	      leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 3);
-	      leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 3);
-	    }
-
-	  if (lookup_for_responses_to_locally_received_packets)
-	    {
-	      lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
-	      lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
-	      lb_index2 = vnet_buffer (p2)->ip.adj_index[VLIB_RX];
-	      lb_index3 = vnet_buffer (p3)->ip.adj_index[VLIB_RX];
-	    }
-	  else
-	    {
-	      lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-	      lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-	      lb_index2 = ip4_fib_mtrie_leaf_get_adj_index (leaf2);
-	      lb_index3 = ip4_fib_mtrie_leaf_get_adj_index (leaf3);
-	    }
-
-	  ASSERT (lb_index0 && lb_index1 && lb_index2 && lb_index3);
-	  lb0 = load_balance_get (lb_index0);
-	  lb1 = load_balance_get (lb_index1);
-	  lb2 = load_balance_get (lb_index2);
-	  lb3 = load_balance_get (lb_index3);
-
-	  ASSERT (lb0->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb0->lb_n_buckets));
-	  ASSERT (lb1->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb1->lb_n_buckets));
-	  ASSERT (lb2->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb2->lb_n_buckets));
-	  ASSERT (lb3->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb3->lb_n_buckets));
-
-	  /* Use flow hash to compute multipath adjacency. */
-	  hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
-	  hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
-	  hash_c2 = vnet_buffer (p2)->ip.flow_hash = 0;
-	  hash_c3 = vnet_buffer (p3)->ip.flow_hash = 0;
-	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
-	    {
-	      flow_hash_config0 = lb0->lb_hash_config;
-	      hash_c0 = vnet_buffer (p0)->ip.flow_hash =
-		ip4_compute_flow_hash (ip0, flow_hash_config0);
-	      dpo0 =
-		load_balance_get_fwd_bucket (lb0,
-					     (hash_c0 &
-					      (lb0->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo0 = load_balance_get_bucket_i (lb0, 0);
-	    }
-	  if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
-	    {
-	      flow_hash_config1 = lb1->lb_hash_config;
-	      hash_c1 = vnet_buffer (p1)->ip.flow_hash =
-		ip4_compute_flow_hash (ip1, flow_hash_config1);
-	      dpo1 =
-		load_balance_get_fwd_bucket (lb1,
-					     (hash_c1 &
-					      (lb1->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo1 = load_balance_get_bucket_i (lb1, 0);
-	    }
-	  if (PREDICT_FALSE (lb2->lb_n_buckets > 1))
-	    {
-	      flow_hash_config2 = lb2->lb_hash_config;
-	      hash_c2 = vnet_buffer (p2)->ip.flow_hash =
-		ip4_compute_flow_hash (ip2, flow_hash_config2);
-	      dpo2 =
-		load_balance_get_fwd_bucket (lb2,
-					     (hash_c2 &
-					      (lb2->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo2 = load_balance_get_bucket_i (lb2, 0);
-	    }
-	  if (PREDICT_FALSE (lb3->lb_n_buckets > 1))
-	    {
-	      flow_hash_config3 = lb3->lb_hash_config;
-	      hash_c3 = vnet_buffer (p3)->ip.flow_hash =
-		ip4_compute_flow_hash (ip3, flow_hash_config3);
-	      dpo3 =
-		load_balance_get_fwd_bucket (lb3,
-					     (hash_c3 &
-					      (lb3->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo3 = load_balance_get_bucket_i (lb3, 0);
-	    }
-
-	  next0 = dpo0->dpoi_next_node;
-	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
-	  next1 = dpo1->dpoi_next_node;
-	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
-	  next2 = dpo2->dpoi_next_node;
-	  vnet_buffer (p2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;
-	  next3 = dpo3->dpoi_next_node;
-	  vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
-
-	  vlib_increment_combined_counter
-	    (cm, thread_index, lb_index0, 1,
-	     vlib_buffer_length_in_chain (vm, p0));
-	  vlib_increment_combined_counter
-	    (cm, thread_index, lb_index1, 1,
-	     vlib_buffer_length_in_chain (vm, p1));
-	  vlib_increment_combined_counter
-	    (cm, thread_index, lb_index2, 1,
-	     vlib_buffer_length_in_chain (vm, p2));
-	  vlib_increment_combined_counter
-	    (cm, thread_index, lb_index3, 1,
-	     vlib_buffer_length_in_chain (vm, p3));
-
-	  vlib_validate_buffer_enqueue_x4 (vm, node, next,
-					   to_next, n_left_to_next,
-					   pi0, pi1, pi2, pi3,
-					   next0, next1, next2, next3);
-	}
-#elif (CLIB_N_PREFETCHES >= 4)
-      while (n_left_from >= 4 && n_left_to_next >= 2)
-	{
-	  vlib_buffer_t *p0, *p1;
-	  ip4_header_t *ip0, *ip1;
-	  ip_lookup_next_t next0, next1;
-	  const load_balance_t *lb0, *lb1;
-	  ip4_fib_mtrie_t *mtrie0, *mtrie1;
-	  ip4_fib_mtrie_leaf_t leaf0, leaf1;
-	  ip4_address_t *dst_addr0, *dst_addr1;
-	  u32 pi0, pi1, lb_index0, lb_index1;
-	  flow_hash_config_t flow_hash_config0, flow_hash_config1;
-	  u32 hash_c0, hash_c1;
-	  const dpo_id_t *dpo0, *dpo1;
-
-	  /* Prefetch next iteration. */
-	  {
-	    vlib_prefetch_buffer_header (b[2], LOAD);
-	    vlib_prefetch_buffer_header (b[3], LOAD);
-
-	    CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
-	    CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
-	  }
-
-	  pi0 = to_next[0] = from[0];
-	  pi1 = to_next[1] = from[1];
-
-	  from += 2;
-	  to_next += 2;
-	  n_left_to_next -= 2;
-	  n_left_from -= 2;
-
-	  p0 = b[0];
-	  p1 = b[1];
-	  b += 2;
-
-	  ip0 = vlib_buffer_get_current (p0);
-	  ip1 = vlib_buffer_get_current (p1);
-
-	  dst_addr0 = &ip0->dst_address;
-	  dst_addr1 = &ip1->dst_address;
-
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p0);
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p1);
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      mtrie0 = &ip4_fib_get (vnet_buffer (p0)->ip.fib_index)->mtrie;
-	      mtrie1 = &ip4_fib_get (vnet_buffer (p1)->ip.fib_index)->mtrie;
-
-	      leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
-	      leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1);
-	    }
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
-	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
-	    }
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
-	      leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
-	    }
-
-	  if (lookup_for_responses_to_locally_received_packets)
-	    {
-	      lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
-	      lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
-	    }
-	  else
-	    {
-	      lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-	      lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
-	    }
-
-	  ASSERT (lb_index0 && lb_index1);
-	  lb0 = load_balance_get (lb_index0);
-	  lb1 = load_balance_get (lb_index1);
-
-	  ASSERT (lb0->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb0->lb_n_buckets));
-	  ASSERT (lb1->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb1->lb_n_buckets));
-
-	  /* Use flow hash to compute multipath adjacency. */
-	  hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
-	  hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
-	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
-	    {
-	      flow_hash_config0 = lb0->lb_hash_config;
-	      hash_c0 = vnet_buffer (p0)->ip.flow_hash =
-		ip4_compute_flow_hash (ip0, flow_hash_config0);
-	      dpo0 =
-		load_balance_get_fwd_bucket (lb0,
-					     (hash_c0 &
-					      (lb0->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo0 = load_balance_get_bucket_i (lb0, 0);
-	    }
-	  if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
-	    {
-	      flow_hash_config1 = lb1->lb_hash_config;
-	      hash_c1 = vnet_buffer (p1)->ip.flow_hash =
-		ip4_compute_flow_hash (ip1, flow_hash_config1);
-	      dpo1 =
-		load_balance_get_fwd_bucket (lb1,
-					     (hash_c1 &
-					      (lb1->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo1 = load_balance_get_bucket_i (lb1, 0);
-	    }
-
-	  next0 = dpo0->dpoi_next_node;
-	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
-	  next1 = dpo1->dpoi_next_node;
-	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
-
-	  vlib_increment_combined_counter
-	    (cm, thread_index, lb_index0, 1,
-	     vlib_buffer_length_in_chain (vm, p0));
-	  vlib_increment_combined_counter
-	    (cm, thread_index, lb_index1, 1,
-	     vlib_buffer_length_in_chain (vm, p1));
-
-	  vlib_validate_buffer_enqueue_x2 (vm, node, next,
-					   to_next, n_left_to_next,
-					   pi0, pi1, next0, next1);
-	}
-#endif
-      while (n_left_from > 0 && n_left_to_next > 0)
-	{
-	  vlib_buffer_t *p0;
-	  ip4_header_t *ip0;
-	  ip_lookup_next_t next0;
-	  const load_balance_t *lb0;
-	  ip4_fib_mtrie_t *mtrie0;
-	  ip4_fib_mtrie_leaf_t leaf0;
-	  ip4_address_t *dst_addr0;
-	  u32 pi0, lbi0;
-	  flow_hash_config_t flow_hash_config0;
-	  const dpo_id_t *dpo0;
-	  u32 hash_c0;
-
-	  pi0 = from[0];
-	  to_next[0] = pi0;
-
-	  p0 = b[0];
-	  b += 1;
-
-	  ip0 = vlib_buffer_get_current (p0);
-	  dst_addr0 = &ip0->dst_address;
-	  ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, p0);
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    {
-	      mtrie0 = &ip4_fib_get (vnet_buffer (p0)->ip.fib_index)->mtrie;
-	      leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
-	    }
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
-
-	  if (!lookup_for_responses_to_locally_received_packets)
-	    leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
-
-	  if (lookup_for_responses_to_locally_received_packets)
-	    lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
-	  else
-	    {
-	      /* Handle default route. */
-	      lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
-	    }
-
-	  ASSERT (lbi0);
-	  lb0 = load_balance_get (lbi0);
-
-	  ASSERT (lb0->lb_n_buckets > 0);
-	  ASSERT (is_pow2 (lb0->lb_n_buckets));
-
-	  /* Use flow hash to compute multipath adjacency. */
-	  hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
-	  if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
-	    {
-	      flow_hash_config0 = lb0->lb_hash_config;
-
-	      hash_c0 = vnet_buffer (p0)->ip.flow_hash =
-		ip4_compute_flow_hash (ip0, flow_hash_config0);
-	      dpo0 =
-		load_balance_get_fwd_bucket (lb0,
-					     (hash_c0 &
-					      (lb0->lb_n_buckets_minus_1)));
-	    }
-	  else
-	    {
-	      dpo0 = load_balance_get_bucket_i (lb0, 0);
-	    }
-
-	  next0 = dpo0->dpoi_next_node;
-	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
-
-	  vlib_increment_combined_counter (cm, thread_index, lbi0, 1,
-					   vlib_buffer_length_in_chain (vm,
-									p0));
-
-	  from += 1;
-	  to_next += 1;
-	  n_left_to_next -= 1;
-	  n_left_from -= 1;
-
-	  if (PREDICT_FALSE (next0 != next))
-	    {
-	      n_left_to_next += 1;
-	      vlib_put_next_frame (vm, node, next, n_left_to_next);
-	      next = next0;
-	      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
-	      to_next[0] = pi0;
-	      to_next += 1;
-	      n_left_to_next -= 1;
-	    }
+	  CLIB_PREFETCH (b[4]->data, sizeof (ip0[0]), LOAD);
+	  CLIB_PREFETCH (b[5]->data, sizeof (ip0[0]), LOAD);
+	  CLIB_PREFETCH (b[6]->data, sizeof (ip0[0]), LOAD);
+	  CLIB_PREFETCH (b[7]->data, sizeof (ip0[0]), LOAD);
 	}
 
-      vlib_put_next_frame (vm, node, next, n_left_to_next);
+      ip0 = vlib_buffer_get_current (b[0]);
+      ip1 = vlib_buffer_get_current (b[1]);
+      ip2 = vlib_buffer_get_current (b[2]);
+      ip3 = vlib_buffer_get_current (b[3]);
+
+      dst_addr0 = &ip0->dst_address;
+      dst_addr1 = &ip1->dst_address;
+      dst_addr2 = &ip2->dst_address;
+      dst_addr3 = &ip3->dst_address;
+
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[0]);
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[1]);
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[2]);
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[3]);
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  mtrie0 = &ip4_fib_get (vnet_buffer (b[0])->ip.fib_index)->mtrie;
+	  mtrie1 = &ip4_fib_get (vnet_buffer (b[1])->ip.fib_index)->mtrie;
+	  mtrie2 = &ip4_fib_get (vnet_buffer (b[2])->ip.fib_index)->mtrie;
+	  mtrie3 = &ip4_fib_get (vnet_buffer (b[3])->ip.fib_index)->mtrie;
+
+	  leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
+	  leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1);
+	  leaf2 = ip4_fib_mtrie_lookup_step_one (mtrie2, dst_addr2);
+	  leaf3 = ip4_fib_mtrie_lookup_step_one (mtrie3, dst_addr3);
+	}
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
+	  leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
+	  leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 2);
+	  leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 2);
+	}
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
+	  leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
+	  leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 3);
+	  leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 3);
+	}
+
+      if (lookup_for_responses_to_locally_received_packets)
+	{
+	  lb_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_RX];
+	  lb_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_RX];
+	  lb_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_RX];
+	  lb_index3 = vnet_buffer (b[3])->ip.adj_index[VLIB_RX];
+	}
+      else
+	{
+	  lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+	  lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+	  lb_index2 = ip4_fib_mtrie_leaf_get_adj_index (leaf2);
+	  lb_index3 = ip4_fib_mtrie_leaf_get_adj_index (leaf3);
+	}
+
+      ASSERT (lb_index0 && lb_index1 && lb_index2 && lb_index3);
+      lb0 = load_balance_get (lb_index0);
+      lb1 = load_balance_get (lb_index1);
+      lb2 = load_balance_get (lb_index2);
+      lb3 = load_balance_get (lb_index3);
+
+      ASSERT (lb0->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb0->lb_n_buckets));
+      ASSERT (lb1->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb1->lb_n_buckets));
+      ASSERT (lb2->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb2->lb_n_buckets));
+      ASSERT (lb3->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb3->lb_n_buckets));
+
+      /* Use flow hash to compute multipath adjacency. */
+      hash_c0 = vnet_buffer (b[0])->ip.flow_hash = 0;
+      hash_c1 = vnet_buffer (b[1])->ip.flow_hash = 0;
+      hash_c2 = vnet_buffer (b[2])->ip.flow_hash = 0;
+      hash_c3 = vnet_buffer (b[3])->ip.flow_hash = 0;
+      if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
+	{
+	  flow_hash_config0 = lb0->lb_hash_config;
+	  hash_c0 = vnet_buffer (b[0])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip0, flow_hash_config0);
+	  dpo0 =
+	    load_balance_get_fwd_bucket (lb0,
+					 (hash_c0 &
+					  (lb0->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo0 = load_balance_get_bucket_i (lb0, 0);
+	}
+      if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
+	{
+	  flow_hash_config1 = lb1->lb_hash_config;
+	  hash_c1 = vnet_buffer (b[1])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip1, flow_hash_config1);
+	  dpo1 =
+	    load_balance_get_fwd_bucket (lb1,
+					 (hash_c1 &
+					  (lb1->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo1 = load_balance_get_bucket_i (lb1, 0);
+	}
+      if (PREDICT_FALSE (lb2->lb_n_buckets > 1))
+	{
+	  flow_hash_config2 = lb2->lb_hash_config;
+	  hash_c2 = vnet_buffer (b[2])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip2, flow_hash_config2);
+	  dpo2 =
+	    load_balance_get_fwd_bucket (lb2,
+					 (hash_c2 &
+					  (lb2->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo2 = load_balance_get_bucket_i (lb2, 0);
+	}
+      if (PREDICT_FALSE (lb3->lb_n_buckets > 1))
+	{
+	  flow_hash_config3 = lb3->lb_hash_config;
+	  hash_c3 = vnet_buffer (b[3])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip3, flow_hash_config3);
+	  dpo3 =
+	    load_balance_get_fwd_bucket (lb3,
+					 (hash_c3 &
+					  (lb3->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo3 = load_balance_get_bucket_i (lb3, 0);
+	}
+
+      next[0] = dpo0->dpoi_next_node;
+      vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+      next[1] = dpo1->dpoi_next_node;
+      vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
+      next[2] = dpo2->dpoi_next_node;
+      vnet_buffer (b[2])->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;
+      next[3] = dpo3->dpoi_next_node;
+      vnet_buffer (b[3])->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
+
+      vlib_increment_combined_counter
+	(cm, thread_index, lb_index0, 1,
+	 vlib_buffer_length_in_chain (vm, b[0]));
+      vlib_increment_combined_counter
+	(cm, thread_index, lb_index1, 1,
+	 vlib_buffer_length_in_chain (vm, b[1]));
+      vlib_increment_combined_counter
+	(cm, thread_index, lb_index2, 1,
+	 vlib_buffer_length_in_chain (vm, b[2]));
+      vlib_increment_combined_counter
+	(cm, thread_index, lb_index3, 1,
+	 vlib_buffer_length_in_chain (vm, b[3]));
+
+      b += 4;
+      next += 4;
+      n_left -= 4;
     }
+#elif (CLIB_N_PREFETCHES >= 4)
+  while (n_left >= 4)
+    {
+      ip4_header_t *ip0, *ip1;
+      const load_balance_t *lb0, *lb1;
+      ip4_fib_mtrie_t *mtrie0, *mtrie1;
+      ip4_fib_mtrie_leaf_t leaf0, leaf1;
+      ip4_address_t *dst_addr0, *dst_addr1;
+      u32 lb_index0, lb_index1;
+      flow_hash_config_t flow_hash_config0, flow_hash_config1;
+      u32 hash_c0, hash_c1;
+      const dpo_id_t *dpo0, *dpo1;
+
+      /* Prefetch next iteration. */
+      {
+	vlib_prefetch_buffer_header (b[2], LOAD);
+	vlib_prefetch_buffer_header (b[3], LOAD);
+
+	CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
+	CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
+      }
+
+      ip0 = vlib_buffer_get_current (b[0]);
+      ip1 = vlib_buffer_get_current (b[1]);
+
+      dst_addr0 = &ip0->dst_address;
+      dst_addr1 = &ip1->dst_address;
+
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[0]);
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[1]);
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  mtrie0 = &ip4_fib_get (vnet_buffer (b[0])->ip.fib_index)->mtrie;
+	  mtrie1 = &ip4_fib_get (vnet_buffer (b[1])->ip.fib_index)->mtrie;
+
+	  leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
+	  leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1);
+	}
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
+	  leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
+	}
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
+	  leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
+	}
+
+      if (lookup_for_responses_to_locally_received_packets)
+	{
+	  lb_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_RX];
+	  lb_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_RX];
+	}
+      else
+	{
+	  lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+	  lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+	}
+
+      ASSERT (lb_index0 && lb_index1);
+      lb0 = load_balance_get (lb_index0);
+      lb1 = load_balance_get (lb_index1);
+
+      ASSERT (lb0->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb0->lb_n_buckets));
+      ASSERT (lb1->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb1->lb_n_buckets));
+
+      /* Use flow hash to compute multipath adjacency. */
+      hash_c0 = vnet_buffer (b[0])->ip.flow_hash = 0;
+      hash_c1 = vnet_buffer (b[1])->ip.flow_hash = 0;
+      if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
+	{
+	  flow_hash_config0 = lb0->lb_hash_config;
+	  hash_c0 = vnet_buffer (b[0])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip0, flow_hash_config0);
+	  dpo0 =
+	    load_balance_get_fwd_bucket (lb0,
+					 (hash_c0 &
+					  (lb0->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo0 = load_balance_get_bucket_i (lb0, 0);
+	}
+      if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
+	{
+	  flow_hash_config1 = lb1->lb_hash_config;
+	  hash_c1 = vnet_buffer (b[1])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip1, flow_hash_config1);
+	  dpo1 =
+	    load_balance_get_fwd_bucket (lb1,
+					 (hash_c1 &
+					  (lb1->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo1 = load_balance_get_bucket_i (lb1, 0);
+	}
+
+      next[0] = dpo0->dpoi_next_node;
+      vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+      next[1] = dpo1->dpoi_next_node;
+      vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
+
+      vlib_increment_combined_counter
+	(cm, thread_index, lb_index0, 1,
+	 vlib_buffer_length_in_chain (vm, b[0]));
+      vlib_increment_combined_counter
+	(cm, thread_index, lb_index1, 1,
+	 vlib_buffer_length_in_chain (vm, b[1]));
+
+      b += 2;
+      next += 2;
+      n_left -= 2;
+    }
+#endif
+  while (n_left > 0)
+    {
+      ip4_header_t *ip0;
+      const load_balance_t *lb0;
+      ip4_fib_mtrie_t *mtrie0;
+      ip4_fib_mtrie_leaf_t leaf0;
+      ip4_address_t *dst_addr0;
+      u32 lbi0;
+      flow_hash_config_t flow_hash_config0;
+      const dpo_id_t *dpo0;
+      u32 hash_c0;
+
+      ip0 = vlib_buffer_get_current (b[0]);
+      dst_addr0 = &ip0->dst_address;
+      ip_lookup_set_buffer_fib_index (im->fib_index_by_sw_if_index, b[0]);
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	{
+	  mtrie0 = &ip4_fib_get (vnet_buffer (b[0])->ip.fib_index)->mtrie;
+	  leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
+	}
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
+
+      if (!lookup_for_responses_to_locally_received_packets)
+	leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
+
+      if (lookup_for_responses_to_locally_received_packets)
+	lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_RX];
+      else
+	{
+	  /* Handle default route. */
+	  lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+	}
+
+      ASSERT (lbi0);
+      lb0 = load_balance_get (lbi0);
+
+      ASSERT (lb0->lb_n_buckets > 0);
+      ASSERT (is_pow2 (lb0->lb_n_buckets));
+
+      /* Use flow hash to compute multipath adjacency. */
+      hash_c0 = vnet_buffer (b[0])->ip.flow_hash = 0;
+      if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
+	{
+	  flow_hash_config0 = lb0->lb_hash_config;
+
+	  hash_c0 = vnet_buffer (b[0])->ip.flow_hash =
+	    ip4_compute_flow_hash (ip0, flow_hash_config0);
+	  dpo0 =
+	    load_balance_get_fwd_bucket (lb0,
+					 (hash_c0 &
+					  (lb0->lb_n_buckets_minus_1)));
+	}
+      else
+	{
+	  dpo0 = load_balance_get_bucket_i (lb0, 0);
+	}
+
+      next[0] = dpo0->dpoi_next_node;
+      vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+      vlib_increment_combined_counter (cm, thread_index, lbi0, 1,
+				       vlib_buffer_length_in_chain (vm,
+								    b[0]));
+
+      b += 1;
+      next += 1;
+      n_left -= 1;
+    }
+
+  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
 
   if (node->flags & VLIB_NODE_FLAG_TRACE)
     ip4_forward_next_trace (vm, node, frame, VLIB_TX);