bonding: add support for numa awareness

This patch enables bonding numa awareness on multi-socket
servers working in active-backup mode.
VPP adds the capability to automatically prefer a slave
on the local numa node in order to reduce the load on the
QPI bus and improve overall system performance in multi-socket
use cases. Users do not need to perform any extra steps;
bonding is configured as usual.

Change-Id: Iec267375fc399a9a0c0a7dca649fadb994d36671
Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
index fccefb1..2e4c8a2 100644
--- a/src/plugins/dpdk/device/init.c
+++ b/src/plugins/dpdk/device/init.c
@@ -732,6 +732,7 @@
 	{
 	  hi->max_packet_bytes = mtu;
 	  hi->max_supported_packet_bytes = max_rx_frame;
+	  hi->numa_node = xd->cpu_socket;
 	}
 
       if (dm->conf->no_tx_checksum_offload == 0)
diff --git a/src/vnet/bonding/cli.c b/src/vnet/bonding/cli.c
index bccbb2c..cb344c6 100644
--- a/src/vnet/bonding/cli.c
+++ b/src/vnet/bonding/cli.c
@@ -28,6 +28,8 @@
   bond_if_t *bif;
   int i;
   uword p;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hw;
   u8 switching_active = 0;
 
   bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
@@ -37,22 +39,53 @@
     p = *vec_elt_at_index (bif->active_slaves, i);
     if (p == sif->sw_if_index)
       {
-	/* Are we disabling the very 1st slave? */
-	if (sif->sw_if_index == *vec_elt_at_index (bif->active_slaves, 0))
-	  switching_active = 1;
-
+	if (sif->sw_if_index == bif->sw_if_index_working)
+	  {
+	    switching_active = 1;
+	    if (bif->mode == BOND_MODE_ACTIVE_BACKUP)
+	      bif->is_local_numa = 0;
+	  }
 	vec_del1 (bif->active_slaves, i);
 	hash_unset (bif->active_slave_by_sw_if_index, sif->sw_if_index);
-
-	/* We got a new slave just becoming active? */
-	if ((vec_len (bif->active_slaves) >= 1) &&
-	    (bif->mode == BOND_MODE_ACTIVE_BACKUP) && switching_active)
-	  vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
-				     BOND_SEND_GARP_NA, bif->hw_if_index);
 	break;
       }
   }
+
+  /* We get a new slave just becoming active */
+  if ((bif->mode == BOND_MODE_ACTIVE_BACKUP) && switching_active)
+    {
+      if ((vec_len (bif->active_slaves) >= 1))
+	{
+	  /* scan all slaves and try to find the first slave with local numa node. */
+	  vec_foreach_index (i, bif->active_slaves)
+	  {
+	    p = *vec_elt_at_index (bif->active_slaves, i);
+	    hw = vnet_get_sup_hw_interface (vnm, p);
+	    if (vm->numa_node == hw->numa_node)
+	      {
+		bif->sw_if_index_working = p;
+		bif->is_local_numa = 1;
+		vlib_process_signal_event (bm->vlib_main,
+					   bond_process_node.index,
+					   BOND_SEND_GARP_NA,
+					   bif->hw_if_index);
+		break;
+	      }
+	  }
+	}
+
+      /* No local numa node is found in the active slave set. Use the first slave */
+      if ((bif->is_local_numa == 0) && (vec_len (bif->active_slaves) >= 1))
+	{
+	  p = *vec_elt_at_index (bif->active_slaves, 0);
+	  bif->sw_if_index_working = p;
+	  vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
+				     BOND_SEND_GARP_NA, bif->hw_if_index);
+	}
+    }
   clib_spinlock_unlock_if_init (&bif->lockp);
+
+  return;
 }
 
 void
@@ -60,6 +93,10 @@
 {
   bond_if_t *bif;
   bond_main_t *bm = &bond_main;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sif->sw_if_index);
+  int i;
+  uword p;
 
   bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
   clib_spinlock_lock_if_init (&bif->lockp);
@@ -72,10 +109,41 @@
       /* First slave becomes active? */
       if ((vec_len (bif->active_slaves) == 1) &&
 	  (bif->mode == BOND_MODE_ACTIVE_BACKUP))
-	vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
-				   BOND_SEND_GARP_NA, bif->hw_if_index);
+	{
+	  bif->sw_if_index_working = sif->sw_if_index;
+	  bif->is_local_numa = (vm->numa_node == hw->numa_node) ? 1 : 0;
+	  vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
+				     BOND_SEND_GARP_NA, bif->hw_if_index);
+	}
+      else if ((vec_len (bif->active_slaves) > 1)
+	       && (bif->mode == BOND_MODE_ACTIVE_BACKUP)
+	       && bif->is_local_numa == 0)
+	{
+	  if (vm->numa_node == hw->numa_node)
+	    {
+	      vec_foreach_index (i, bif->active_slaves)
+	      {
+		p = *vec_elt_at_index (bif->active_slaves, 0);
+		if (p == sif->sw_if_index)
+		  break;
+
+		vec_del1 (bif->active_slaves, 0);
+		hash_unset (bif->active_slave_by_sw_if_index, p);
+		vec_add1 (bif->active_slaves, p);
+		hash_set (bif->active_slave_by_sw_if_index, p, p);
+	      }
+	      bif->sw_if_index_working = sif->sw_if_index;
+	      bif->is_local_numa = 1;
+	      vlib_process_signal_event (bm->vlib_main,
+					 bond_process_node.index,
+					 BOND_SEND_GARP_NA, bif->hw_if_index);
+
+	    }
+	}
     }
   clib_spinlock_unlock_if_init (&bif->lockp);
+
+  return;
 }
 
 int
diff --git a/src/vnet/bonding/node.h b/src/vnet/bonding/node.h
index 41e945a..b046f98 100644
--- a/src/vnet/bonding/node.h
+++ b/src/vnet/bonding/node.h
@@ -157,6 +157,11 @@
   u8 mode;
   u8 lb;
 
+  /* This flag works for active-backup mode only
+     and marks if the working port is local numa. */
+  u8 is_local_numa;
+  /* current working sw_if_index in active-backup mode. */
+  u32 sw_if_index_working;
   /* the last slave index for the rr lb */
   u32 lb_rr_last_index;
 
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index c32311c..d87de60 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -565,6 +565,10 @@
   /* device input device_and_queue runtime index */
   uword *dq_runtime_index_by_queue;
 
+  /* numa node that hardware device connects to */
+  u8 numa_node;
+
+  u8 padding[3];
 } vnet_hw_interface_t;
 
 extern vnet_device_class_t vnet_local_interface_device_class;