bonding: add support for numa awareness
This patch enables bonding NUMA awareness on multi-socket
servers working in active-backup mode.
VPP now automatically prefers a slave on the local NUMA node
in order to reduce the load on the QPI bus and improve overall
system performance in multi-socket use cases. Users do not need
to perform any extra operations; bonding is configured and used
as usual.
Change-Id: Iec267375fc399a9a0c0a7dca649fadb994d36671
Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
index fccefb1..2e4c8a2 100644
--- a/src/plugins/dpdk/device/init.c
+++ b/src/plugins/dpdk/device/init.c
@@ -732,6 +732,7 @@
{
hi->max_packet_bytes = mtu;
hi->max_supported_packet_bytes = max_rx_frame;
+ hi->numa_node = xd->cpu_socket;
}
if (dm->conf->no_tx_checksum_offload == 0)
diff --git a/src/vnet/bonding/cli.c b/src/vnet/bonding/cli.c
index bccbb2c..cb344c6 100644
--- a/src/vnet/bonding/cli.c
+++ b/src/vnet/bonding/cli.c
@@ -28,6 +28,8 @@
bond_if_t *bif;
int i;
uword p;
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_hw_interface_t *hw;
u8 switching_active = 0;
bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
@@ -37,22 +39,53 @@
p = *vec_elt_at_index (bif->active_slaves, i);
if (p == sif->sw_if_index)
{
- /* Are we disabling the very 1st slave? */
- if (sif->sw_if_index == *vec_elt_at_index (bif->active_slaves, 0))
- switching_active = 1;
-
+ if (sif->sw_if_index == bif->sw_if_index_working)
+ {
+ switching_active = 1;
+ if (bif->mode == BOND_MODE_ACTIVE_BACKUP)
+ bif->is_local_numa = 0;
+ }
vec_del1 (bif->active_slaves, i);
hash_unset (bif->active_slave_by_sw_if_index, sif->sw_if_index);
-
- /* We got a new slave just becoming active? */
- if ((vec_len (bif->active_slaves) >= 1) &&
- (bif->mode == BOND_MODE_ACTIVE_BACKUP) && switching_active)
- vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
- BOND_SEND_GARP_NA, bif->hw_if_index);
break;
}
}
+
+ /* We get a new slave just becoming active */
+ if ((bif->mode == BOND_MODE_ACTIVE_BACKUP) && switching_active)
+ {
+ if ((vec_len (bif->active_slaves) >= 1))
+ {
+ /* scan all slaves and try to find the first slave with local numa node. */
+ vec_foreach_index (i, bif->active_slaves)
+ {
+ p = *vec_elt_at_index (bif->active_slaves, i);
+ hw = vnet_get_sup_hw_interface (vnm, p);
+ if (vm->numa_node == hw->numa_node)
+ {
+ bif->sw_if_index_working = p;
+ bif->is_local_numa = 1;
+ vlib_process_signal_event (bm->vlib_main,
+ bond_process_node.index,
+ BOND_SEND_GARP_NA,
+ bif->hw_if_index);
+ break;
+ }
+ }
+ }
+
+ /* No local numa node is found in the active slave set. Use the first slave */
+ if ((bif->is_local_numa == 0) && (vec_len (bif->active_slaves) >= 1))
+ {
+ p = *vec_elt_at_index (bif->active_slaves, 0);
+ bif->sw_if_index_working = p;
+ vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
+ BOND_SEND_GARP_NA, bif->hw_if_index);
+ }
+ }
clib_spinlock_unlock_if_init (&bif->lockp);
+
+ return;
}
void
@@ -60,6 +93,10 @@
{
bond_if_t *bif;
bond_main_t *bm = &bond_main;
+ vnet_main_t *vnm = vnet_get_main ();
+ vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sif->sw_if_index);
+ int i;
+ uword p;
bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
clib_spinlock_lock_if_init (&bif->lockp);
@@ -72,10 +109,41 @@
/* First slave becomes active? */
if ((vec_len (bif->active_slaves) == 1) &&
(bif->mode == BOND_MODE_ACTIVE_BACKUP))
- vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
- BOND_SEND_GARP_NA, bif->hw_if_index);
+ {
+ bif->sw_if_index_working = sif->sw_if_index;
+ bif->is_local_numa = (vm->numa_node == hw->numa_node) ? 1 : 0;
+ vlib_process_signal_event (bm->vlib_main, bond_process_node.index,
+ BOND_SEND_GARP_NA, bif->hw_if_index);
+ }
+ else if ((vec_len (bif->active_slaves) > 1)
+ && (bif->mode == BOND_MODE_ACTIVE_BACKUP)
+ && bif->is_local_numa == 0)
+ {
+ if (vm->numa_node == hw->numa_node)
+ {
+ vec_foreach_index (i, bif->active_slaves)
+ {
+ p = *vec_elt_at_index (bif->active_slaves, 0);
+ if (p == sif->sw_if_index)
+ break;
+
+ vec_del1 (bif->active_slaves, 0);
+ hash_unset (bif->active_slave_by_sw_if_index, p);
+ vec_add1 (bif->active_slaves, p);
+ hash_set (bif->active_slave_by_sw_if_index, p, p);
+ }
+ bif->sw_if_index_working = sif->sw_if_index;
+ bif->is_local_numa = 1;
+ vlib_process_signal_event (bm->vlib_main,
+ bond_process_node.index,
+ BOND_SEND_GARP_NA, bif->hw_if_index);
+
+ }
+ }
}
clib_spinlock_unlock_if_init (&bif->lockp);
+
+ return;
}
int
diff --git a/src/vnet/bonding/node.h b/src/vnet/bonding/node.h
index 41e945a..b046f98 100644
--- a/src/vnet/bonding/node.h
+++ b/src/vnet/bonding/node.h
@@ -157,6 +157,11 @@
u8 mode;
u8 lb;
+ /* This flag works for active-backup mode only
+ and marks if the working port is local numa. */
+ u8 is_local_numa;
+ /* current working sw_if_index in active-backup mode. */
+ u32 sw_if_index_working;
/* the last slave index for the rr lb */
u32 lb_rr_last_index;
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index c32311c..d87de60 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -565,6 +565,10 @@
/* device input device_and_queue runtime index */
uword *dq_runtime_index_by_queue;
+ /* numa node that hardware device connects to */
+ u8 numa_node;
+
+ u8 padding[3];
} vnet_hw_interface_t;
extern vnet_device_class_t vnet_local_interface_device_class;