bond: tx perf improvement, part trois

Introduce bond_tx_inline, which takes the load-balance algorithm (lb) as a constant argument so that gcc can optimize each inlined specialization.

The numbers appear a tad better for 256-byte frames.

with the patch
--------------
Thread 2 vpp_wk_1 (lcore 3)
Time 4.3, average vectors/node 224.00, last 128 main loops 40.00 per node 222.61
  vector rates in 8.4836e6, out 1.6967e7, drop 0.0000e0, punt 0.0000e0
             Name                 State         Calls          Vectors        Suspends         Clocks       Vectors/Call
BondEthernet0-output             active             141054        36109824               0          2.51e1          256.00
BondEthernet0-tx                 active             141054        36109824               0          2.55e1          256.00
TenGigabitEthernet6/0/0-output   active             141054        18055469               0          9.43e0          128.00
TenGigabitEthernet6/0/0-tx       active             141054        18055469               0          6.97e1          128.00
TenGigabitEthernet6/0/1-output   active             141054        18054355               0          9.54e0          127.99
TenGigabitEthernet6/0/1-tx       active             141054        18054355               0          7.05e1          127.99
bond-input                       active             141054        36109824               0          1.76e1          256.00
dpdk-input                       polling             70527        36109824               0          5.03e1          512.00
ethernet-input                   active             141054        36109824               0          6.12e1          256.00
ip4-input                        active             141054        36109824               0          3.26e1          256.00
ip4-lookup                       active             141054        36109824               0          2.94e1          256.00
ip4-rewrite                      active             141054        36109824               0          3.27e1          256.00

without the patch
-----------------
Thread 2 vpp_wk_1 (lcore 3)
Time 4.3, average vectors/node 224.00, last 128 main loops 40.00 per node 222.61
  vector rates in 8.4443e6, out 1.6889e7, drop 0.0000e0, punt 0.0000e0
             Name                 State         Calls          Vectors        Suspends         Clocks       Vectors/Call
BondEthernet0-output             active             142744        36542464               0          2.51e1          256.00
BondEthernet0-tx                 active             142744        36542464               0          2.67e1          256.00
TenGigabitEthernet6/0/0-output   active             142744        18270813               0          9.19e0          127.99
TenGigabitEthernet6/0/0-tx       active             142744        18270813               0          6.98e1          127.99
TenGigabitEthernet6/0/1-output   active             142744        18271651               0          9.43e0          128.00
TenGigabitEthernet6/0/1-tx       active             142744        18271651               0          7.02e1          128.00
bond-input                       active             142744        36542464               0          1.76e1          256.00
dpdk-input                       polling             71372        36542464               0          5.08e1          512.00
ethernet-input                   active             142744        36542464               0          6.15e1          256.00
ip4-input                        active             142744        36542464               0          3.23e1          256.00
ip4-lookup                       active             142744        36542464               0          2.96e1          256.00
ip4-rewrite                      active             142744        36542464               0          3.28e1          256.00

Change-Id: I9fd43eda3c735cbff680ac6d2f01ecdae81f0eda
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vnet/bonding/device.c b/src/vnet/bonding/device.c
index 79ca2fa..8a78728 100644
--- a/src/vnet/bonding/device.c
+++ b/src/vnet/bonding/device.c
@@ -379,63 +379,28 @@
   return 0;
 }
 
-static bond_load_balance_func_t bond_load_balance_table[] = {
-#define _(v,f,s, p) { bond_load_balance_##p },
-  foreach_bond_lb_algo
-#undef _
-};
-
-VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
-					  vlib_node_runtime_t * node,
-					  vlib_frame_t * frame)
+static_always_inline void
+bond_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+		vlib_frame_t * frame, bond_if_t * bif,
+		uword slave_count, u32 lb_alg)
 {
-  vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
   bond_main_t *bm = &bond_main;
-  bond_if_t *bif = pool_elt_at_index (bm->interfaces, rund->dev_instance);
-  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
-  u32 *from = vlib_frame_vector_args (frame);
-  ethernet_header_t *eth;
-  u32 n_left;
-  u32 sw_if_index;
+  vnet_main_t *vnm = vnet_get_main ();
+  u16 thread_index = vm->thread_index;
   bond_packet_trace_t *t0;
   uword n_trace = vlib_get_trace_count (vm, node);
-  u16 thread_index = vm->thread_index;
-  vnet_main_t *vnm = vnet_get_main ();
   u32 *to_next;
   vlib_frame_t *f;
-  uword slave_count;
+  ethernet_header_t *eth;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
+  u32 *from = vlib_frame_vector_args (frame);
+  u32 n_left = frame->n_vectors;
+  u32 sw_if_index;
   u32 port0 = 0, port1 = 0, port2 = 0, port3 = 0;
   bond_per_thread_data_t *ptd = vec_elt_at_index (bm->per_thread_data,
 						  thread_index);
 
-  if (PREDICT_FALSE (bif->admin_up == 0))
-    {
-      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
-      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
-				     VNET_INTERFACE_COUNTER_DROP,
-				     thread_index, bif->sw_if_index,
-				     frame->n_vectors);
-      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_IF_DOWN,
-			frame->n_vectors);
-      return frame->n_vectors;
-    }
-
-  n_left = frame->n_vectors;
   vlib_get_buffers (vm, from, bufs, n_left);
-
-  slave_count = vec_len (bif->active_slaves);
-  if (PREDICT_FALSE (slave_count == 0))
-    {
-      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
-      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
-				     VNET_INTERFACE_COUNTER_DROP,
-				     thread_index, bif->sw_if_index,
-				     frame->n_vectors);
-      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_NO_SLAVE,
-			frame->n_vectors);
-      return frame->n_vectors;
-    }
-
   b = bufs;
   while (n_left >= 4)
     {
@@ -464,22 +429,72 @@
 
       if (PREDICT_TRUE (slave_count > 1))
 	{
-	  port0 =
-	    (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-							     bif, b[0],
-							     slave_count);
-	  port1 =
-	    (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-							     bif, b[1],
-							     slave_count);
-	  port2 =
-	    (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-							     bif, b[2],
-							     slave_count);
-	  port3 =
-	    (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-							     bif, b[3],
-							     slave_count);
+	  if (lb_alg == BOND_LB_L2)
+	    {
+	      port0 = bond_load_balance_l2 (vm, node, bif, b[0], slave_count);
+	      port1 = bond_load_balance_l2 (vm, node, bif, b[1], slave_count);
+	      port2 = bond_load_balance_l2 (vm, node, bif, b[2], slave_count);
+	      port3 = bond_load_balance_l2 (vm, node, bif, b[3], slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_L34)
+	    {
+	      port0 = bond_load_balance_l34 (vm, node, bif, b[0],
+					     slave_count);
+	      port1 = bond_load_balance_l34 (vm, node, bif, b[1],
+					     slave_count);
+	      port2 = bond_load_balance_l34 (vm, node, bif, b[2],
+					     slave_count);
+	      port3 = bond_load_balance_l34 (vm, node, bif, b[3],
+					     slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_L23)
+	    {
+	      port0 = bond_load_balance_l23 (vm, node, bif, b[0],
+					     slave_count);
+	      port1 = bond_load_balance_l23 (vm, node, bif, b[1],
+					     slave_count);
+	      port2 = bond_load_balance_l23 (vm, node, bif, b[2],
+					     slave_count);
+	      port3 = bond_load_balance_l23 (vm, node, bif, b[3],
+					     slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_RR)
+	    {
+	      port0 = bond_load_balance_round_robin (vm, node, bif, b[0],
+						     slave_count);
+	      port1 = bond_load_balance_round_robin (vm, node, bif, b[1],
+						     slave_count);
+	      port2 = bond_load_balance_round_robin (vm, node, bif, b[2],
+						     slave_count);
+	      port3 = bond_load_balance_round_robin (vm, node, bif, b[3],
+						     slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_BC)
+	    {
+	      port0 = bond_load_balance_broadcast (vm, node, bif, b[0],
+						   slave_count);
+	      port1 = bond_load_balance_broadcast (vm, node, bif, b[1],
+						   slave_count);
+	      port2 = bond_load_balance_broadcast (vm, node, bif, b[2],
+						   slave_count);
+	      port3 = bond_load_balance_broadcast (vm, node, bif, b[3],
+						   slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_AB)
+	    {
+	      port0 = bond_load_balance_active_backup (vm, node, bif, b[0],
+						       slave_count);
+	      port1 = bond_load_balance_active_backup (vm, node, bif, b[1],
+						       slave_count);
+	      port2 = bond_load_balance_active_backup (vm, node, bif, b[2],
+						       slave_count);
+	      port3 = bond_load_balance_active_backup (vm, node, bif, b[3],
+						       slave_count);
+	    }
+	  else
+	    {
+	      ASSERT (0);
+	    }
 	}
 
       sif_if_index0 = *vec_elt_at_index (bif->active_slaves, port0);
@@ -574,9 +589,42 @@
       VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
 
       if (PREDICT_TRUE (slave_count > 1))
-	port0 =
-	  (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif,
-							   b[0], slave_count);
+	{
+	  /* Test the constant lb_alg, not bif->lb, so dead branches fold away */
+	  if (lb_alg == BOND_LB_L2)
+	    {
+	      port0 = bond_load_balance_l2 (vm, node, bif, b[0], slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_L34)
+	    {
+	      port0 = bond_load_balance_l34 (vm, node, bif, b[0],
+					     slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_L23)
+	    {
+	      port0 = bond_load_balance_l23 (vm, node, bif, b[0],
+					     slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_RR)
+	    {
+	      port0 = bond_load_balance_round_robin (vm, node, bif, b[0],
+						     slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_BC)
+	    {
+	      port0 = bond_load_balance_broadcast (vm, node, bif, b[0],
+						   slave_count);
+	    }
+	  else if (lb_alg == BOND_LB_AB)
+	    {
+	      port0 = bond_load_balance_active_backup (vm, node, bif, b[0],
+						       slave_count);
+	    }
+	  else
+	    {
+	      ASSERT (0);
+	    }
+	}
       sif_if_index0 = *vec_elt_at_index (bif->active_slaves, port0);
 
       /* Do the tracing before the old interface is overwritten */
@@ -622,6 +670,57 @@
   vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters
 				 + VNET_INTERFACE_COUNTER_TX, thread_index,
 				 bif->sw_if_index, frame->n_vectors);
+}
+
+VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
+					  vlib_node_runtime_t * node,
+					  vlib_frame_t * frame)
+{
+  vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
+  bond_main_t *bm = &bond_main;
+  u16 thread_index = vm->thread_index;
+  bond_if_t *bif = pool_elt_at_index (bm->interfaces, rund->dev_instance);
+  uword slave_count;
+
+  if (PREDICT_FALSE (bif->admin_up == 0))
+    {
+      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
+				     VNET_INTERFACE_COUNTER_DROP,
+				     thread_index, bif->sw_if_index,
+				     frame->n_vectors);
+      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_IF_DOWN,
+			frame->n_vectors);
+      return frame->n_vectors;
+    }
+
+  slave_count = vec_len (bif->active_slaves);
+  if (PREDICT_FALSE (slave_count == 0))
+    {
+      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
+				     VNET_INTERFACE_COUNTER_DROP,
+				     thread_index, bif->sw_if_index,
+				     frame->n_vectors);
+      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_NO_SLAVE,
+			frame->n_vectors);
+      return frame->n_vectors;
+    }
+
+  if (bif->lb == BOND_LB_L2)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_L2);
+  else if (bif->lb == BOND_LB_L34)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_L34);
+  else if (bif->lb == BOND_LB_L23)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_L23);
+  else if (bif->lb == BOND_LB_RR)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_RR);
+  else if (bif->lb == BOND_LB_BC)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_BC);
+  else if (bif->lb == BOND_LB_AB)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_AB);
+  else
+    ASSERT (0);
 
   return frame->n_vectors;
 }