vnet: add device-input thread placement infra

This change adds two new debug CLI commands:

- "show interface placement" to display which
thread (main or worker) is responsible for processing
each interface rx queue

vpp# show interface placement
Thread 0 (vpp_main):
  node af-packet-input:
    host-vpp1 queue 0
Thread 1 (vpp_wk_0):
  node af-packet-input:
    host-virbr0 queue 0
Thread 2 (vpp_wk_1):
  node af-packet-input:
    host-vpp2 queue 0
    host-lxcbr0 queue 0

- "set interface placement" to assign the thread (main or worker)
that processes a specific interface rx queue

vpp# set interface placement host-vpp1 queue 0 main

Change-Id: Id4dd00cf2b05e10fae2125ac7cb4411b446c5e9c
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 40789f5..ef3a24d 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -685,9 +685,6 @@
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
 			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
 					 n->runtime_data_bytes));
-		else if (CLIB_DEBUG > 0)
-		  memset (rt->runtime_data, 0xfe,
-			  VLIB_NODE_RUNTIME_DATA_SIZE);
 	      }
 
 	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
@@ -701,9 +698,6 @@
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
 			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
 					 n->runtime_data_bytes));
-		else if (CLIB_DEBUG > 0)
-		  memset (rt->runtime_data, 0xfe,
-			  VLIB_NODE_RUNTIME_DATA_SIZE);
 	      }
 
 	      nm_clone->processes = vec_dup (nm->processes);
@@ -1405,15 +1399,15 @@
   clib_time_init (&vm->clib_time);
   clib_mem_set_heap (w->thread_mheap);
 
+  /* Wait until the dpdk init sequence is complete */
+  while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
+    vlib_worker_thread_barrier_check ();
+
   e = vlib_call_init_exit_functions
     (vm, vm->worker_init_function_registrations, 1 /* call_once */ );
   if (e)
     clib_error_report (e);
 
-  /* Wait until the dpdk init sequence is complete */
-  while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
-    vlib_worker_thread_barrier_check ();
-
   vlib_worker_loop (vm);
 }
 
diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c
index e491ba4..5fdc59f 100644
--- a/src/vnet/devices/af_packet/af_packet.c
+++ b/src/vnet/devices/af_packet/af_packet.c
@@ -67,15 +67,16 @@
 static clib_error_t *
 af_packet_fd_read_ready (unix_file_t * uf)
 {
-  vlib_main_t *vm = vlib_get_main ();
   af_packet_main_t *apm = &af_packet_main;
+  vnet_main_t *vnm = vnet_get_main ();
   u32 idx = uf->private_data;
+  af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, idx);
 
   apm->pending_input_bitmap =
     clib_bitmap_set (apm->pending_input_bitmap, idx, 1);
 
   /* Schedule the rx node */
-  vlib_node_set_interrupt_pending (vm, af_packet_input_node.index);
+  vnet_device_input_set_interrupt_pending (vnm, apif->hw_if_index, 0);
 
   return 0;
 }
@@ -171,31 +172,6 @@
   return ret;
 }
 
-static void
-af_packet_worker_thread_enable ()
-{
-  /* If worker threads are enabled, switch to polling mode */
-  foreach_vlib_main ((
-		       {
-		       vlib_node_set_state (this_vlib_main,
-					    af_packet_input_node.index,
-					    VLIB_NODE_STATE_POLLING);
-		       }));
-
-}
-
-static void
-af_packet_worker_thread_disable ()
-{
-  foreach_vlib_main ((
-		       {
-		       vlib_node_set_state (this_vlib_main,
-					    af_packet_input_node.index,
-					    VLIB_NODE_STATE_INTERRUPT);
-		       }));
-
-}
-
 int
 af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
 		     u32 * sw_if_index)
@@ -298,6 +274,9 @@
 
   sw = vnet_get_hw_sw_interface (vnm, apif->hw_if_index);
   apif->sw_if_index = sw->sw_if_index;
+  vnet_set_device_input_node (apif->hw_if_index, af_packet_input_node.index);
+  vnet_device_input_assign_thread (apif->hw_if_index, 0,	/* queue */
+				   ~0 /* any cpu */ );
 
   vnet_hw_interface_set_flags (vnm, apif->hw_if_index,
 			       VNET_HW_INTERFACE_FLAG_LINK_UP);
@@ -307,9 +286,6 @@
   if (sw_if_index)
     *sw_if_index = apif->sw_if_index;
 
-  if (tm->n_vlib_mains > 1 && pool_elts (apm->interfaces) == 1)
-    af_packet_worker_thread_enable ();
-
   return 0;
 
 error:
@@ -323,7 +299,6 @@
 af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name)
 {
   vnet_main_t *vnm = vnet_get_main ();
-  vlib_thread_main_t *tm = vlib_get_thread_main ();
   af_packet_main_t *apm = &af_packet_main;
   af_packet_if_t *apif;
   uword *p;
@@ -373,8 +348,6 @@
   ethernet_delete_interface (vnm, apif->hw_if_index);
 
   pool_put (apm->interfaces, apif);
-  if (tm->n_vlib_mains > 1 && pool_elts (apm->interfaces) == 0)
-    af_packet_worker_thread_disable ();
 
   return 0;
 }
@@ -384,24 +357,9 @@
 {
   af_packet_main_t *apm = &af_packet_main;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
-  vlib_thread_registration_t *tr;
-  uword *p;
 
   memset (apm, 0, sizeof (af_packet_main_t));
 
-  apm->input_cpu_first_index = 0;
-  apm->input_cpu_count = 1;
-
-  /* find out which cpus will be used for input */
-  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
-  tr = p ? (vlib_thread_registration_t *) p[0] : 0;
-
-  if (tr && tr->count > 0)
-    {
-      apm->input_cpu_first_index = tr->first_index;
-      apm->input_cpu_count = tr->count;
-    }
-
   mhash_init_vec_string (&apm->if_index_by_host_if_name, sizeof (uword));
 
   vec_validate_aligned (apm->rx_buffers, tm->n_vlib_mains - 1,
diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h
index e00e5cb..50ec237 100644
--- a/src/vnet/devices/af_packet/af_packet.h
+++ b/src/vnet/devices/af_packet/af_packet.h
@@ -51,12 +51,6 @@
 
   /* hash of host interface names */
   mhash_t if_index_by_host_if_name;
-
-  /* first cpu index */
-  u32 input_cpu_first_index;
-
-  /* total cpu count */
-  u32 input_cpu_count;
 } af_packet_main_t;
 
 af_packet_main_t af_packet_main;
diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c
index ab7fd80..ba337f3 100644
--- a/src/vnet/devices/af_packet/node.c
+++ b/src/vnet/devices/af_packet/node.c
@@ -246,20 +246,18 @@
 af_packet_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 		    vlib_frame_t * frame)
 {
-  int i;
   u32 n_rx_packets = 0;
-  u32 cpu_index = os_get_cpu_number ();
   af_packet_main_t *apm = &af_packet_main;
-  af_packet_if_t *apif;
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  vnet_device_and_queue_t *dq;
 
-  for (i = 0; i < vec_len (apm->interfaces); i++)
-    {
-      apif = vec_elt_at_index (apm->interfaces, i);
-      if (apif->is_admin_up &&
-	  (i % apm->input_cpu_count) ==
-	  (cpu_index - apm->input_cpu_first_index))
-	n_rx_packets += af_packet_device_input_fn (vm, node, frame, apif);
-    }
+  vec_foreach (dq, rt->devices_and_queues)
+  {
+    af_packet_if_t *apif;
+    apif = vec_elt_at_index (apm->interfaces, dq->dev_instance);
+    if (apif->is_admin_up)
+      n_rx_packets += af_packet_device_input_fn (vm, node, frame, apif);
+  }
 
   return n_rx_packets;
 }
@@ -271,9 +269,6 @@
   .sibling_of = "device-input",
   .format_trace = format_af_packet_input_trace,
   .type = VLIB_NODE_TYPE_INPUT,
-  /**
-   * default state is INTERRUPT mode, switch to POLLING if worker threads are enabled
-   */
   .state = VLIB_NODE_STATE_INTERRUPT,
   .n_errors = AF_PACKET_INPUT_N_ERROR,
   .error_strings = af_packet_input_error_strings,
diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c
index 38f3002..4164522 100644
--- a/src/vnet/devices/devices.c
+++ b/src/vnet/devices/devices.c
@@ -32,6 +32,7 @@
 VLIB_REGISTER_NODE (device_input_node) = {
   .function = device_input_fn,
   .name = "device-input",
+  .runtime_data_bytes = sizeof (vnet_device_input_runtime_t),
   .type = VLIB_NODE_TYPE_INPUT,
   .state = VLIB_NODE_STATE_DISABLED,
   .n_next_nodes = VNET_DEVICE_INPUT_N_NEXT_NODES,
@@ -83,18 +84,257 @@
 };
 /* *INDENT-ON* */
 
+static int
+vnet_device_queue_sort (void *a1, void *a2)
+{
+  vnet_device_and_queue_t *dq1 = a1;
+  vnet_device_and_queue_t *dq2 = a2;
+
+  if (dq1->dev_instance > dq2->dev_instance)
+    return 1;
+  else if (dq1->dev_instance < dq2->dev_instance)
+    return -1;
+  else if (dq1->queue_id > dq2->queue_id)
+    return 1;
+  else if (dq1->queue_id < dq2->queue_id)
+    return -1;
+  else
+    return 0;
+}
+
+void
+vnet_device_input_assign_thread (u32 hw_if_index,
+				 u16 queue_id, uword cpu_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_device_main_t *vdm = &vnet_device_main;
+  vlib_main_t *vm;
+  vnet_device_input_runtime_t *rt;
+  vnet_device_and_queue_t *dq;
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+
+  ASSERT (hw->input_node_index > 0);
+
+  if (vdm->first_worker_cpu_index == 0)
+    cpu_index = 0;
+
+  if (cpu_index != 0 &&
+      (cpu_index < vdm->first_worker_cpu_index ||
+       cpu_index > vdm->last_worker_cpu_index))
+    {
+      cpu_index = vdm->next_worker_cpu_index++;
+      if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index)
+	vdm->next_worker_cpu_index = vdm->first_worker_cpu_index;
+    }
+
+  vm = vlib_mains[cpu_index];
+  rt = vlib_node_get_runtime_data (vm, hw->input_node_index);
+
+  vec_add2 (rt->devices_and_queues, dq, 1);
+  dq->hw_if_index = hw_if_index;
+  dq->dev_instance = hw->dev_instance;
+  dq->queue_id = queue_id;
+
+  vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort);
+  vec_validate (hw->input_node_cpu_index_by_queue, queue_id);
+  hw->input_node_cpu_index_by_queue[queue_id] = cpu_index;
+}
+
+/* Detach a device rx queue from the thread it is currently assigned to.
+ * Returns 0 on success, VNET_API_ERROR_INVALID_INTERFACE if not assigned. */
+static int
+vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id,
+				   uword cpu_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  vnet_device_input_runtime_t *rt;
+  vnet_device_and_queue_t *dq;
+  uword old_cpu_index;
+
+  /* No thread recorded for this queue: nothing to detach. */
+  if (hw->input_node_cpu_index_by_queue == 0 ||
+      vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1)
+    return VNET_API_ERROR_INVALID_INTERFACE;
+
+  old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id];
+
+  /* Detach even when cpu_index == old_cpu_index: the caller re-adds the
+   * entry afterwards, so returning early here would leave a duplicate. */
+
+  rt = vlib_node_get_runtime_data (vlib_mains[old_cpu_index],
+				   hw->input_node_index);
+
+  vec_foreach (dq, rt->devices_and_queues)
+    if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id)
+    {
+      vec_del1 (rt->devices_and_queues, dq - rt->devices_and_queues);
+      goto deleted;
+    }
+
+  return VNET_API_ERROR_INVALID_INTERFACE;
+
+deleted:
+  vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort);
+
+  return 0;
+}
+
+/* CLI: show which rx queues each thread serves, grouped by input node. */
+static clib_error_t *
+show_device_placement_fn (vlib_main_t * vm, unformat_input_t * input,
+			  vlib_cli_command_t * cmd)
+{
+  u8 *s = 0;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_device_input_runtime_t *rt;
+  vnet_device_and_queue_t *dq;
+  vlib_node_t *pn = vlib_get_node_by_name (vm, (u8 *) "device-input");
+  uword si;
+  int index = 0;
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main (({
+    clib_bitmap_foreach (si, pn->sibling_bitmap,
+      ({
+        rt = vlib_node_get_runtime_data (this_vlib_main, si);
+        if (vec_len (rt->devices_and_queues))
+          s = format (s, "  node %U:\n", format_vlib_node_name, vm, si);
+        /* The name formatter takes a sw_if_index, so map the hw index. */
+        vec_foreach (dq, rt->devices_and_queues)
+	  {
+	    s = format (s, "    %U queue %u\n", format_vnet_sw_if_index_name,
+	      vnm, vnet_get_hw_sw_interface (vnm, dq->hw_if_index)->sw_if_index,
+	      dq->queue_id);
+	  }
+      }));
+    if (vec_len (s) > 0)
+      {
+        vlib_cli_output(vm, "Thread %u (%v):\n%v", index,
+			vlib_worker_threads[index].name, s);
+        vec_reset_length (s);
+      }
+    index++;
+  }));
+  /* *INDENT-ON* */
+
+  vec_free (s);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_interface_placement_command, static) = {
+  .path = "show interface placement",
+  .short_help = "show interface placement",
+  .function = show_device_placement_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_device_placement (vlib_main_t * vm, unformat_input_t * input,
+		      vlib_cli_command_t * cmd)
+{
+  clib_error_t *error = 0;
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_device_main_t *vdm = &vnet_device_main;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 queue_id = (u32) 0;
+  u32 cpu_index = (u32) ~ 0;
+  int rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+	  (line_input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index))
+	;
+      else if (unformat (line_input, "queue %d", &queue_id))
+	;
+      else if (unformat (line_input, "main", &cpu_index))
+	cpu_index = 0;
+      else if (unformat (line_input, "worker %d", &cpu_index))
+	cpu_index += vdm->first_worker_cpu_index;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  unformat_free (line_input);
+	  return error;
+	}
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  if (cpu_index > vdm->last_worker_cpu_index)
+    return clib_error_return (0,
+			      "please specify valid worker thread or main");
+
+  rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index);
+
+  if (rv)
+    return clib_error_return (0, "not found");
+
+  vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index);
+
+  return 0;
+}
+
+/*?
+ * This command is used to assign a given interface, and optionally a
+ * given queue, to a different thread. If the '<em>queue</em>' is not provided,
+ * it defaults to 0.
+ *
+ * @cliexpar
+ * Example of how to display the interface placement:
+ * @cliexstart{show interface placement}
+ * Thread 1 (vpp_wk_0):
+ *   GigabitEthernet0/8/0 queue 0
+ *   GigabitEthernet0/9/0 queue 0
+ * Thread 2 (vpp_wk_1):
+ *   GigabitEthernet0/8/0 queue 1
+ *   GigabitEthernet0/9/0 queue 1
+ * @cliexend
+ * Example of how to assign an interface and queue to a worker thread:
+ * @cliexcmd{set interface placement GigabitEthernet0/8/0 queue 1 worker 0}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
+    .path = "set interface placement",
+    .short_help = "set interface placement <interface> [queue <n>] [worker <n> | main]",
+    .function = set_device_placement,
+};
+/* *INDENT-ON* */
+
 static clib_error_t *
 vnet_device_init (vlib_main_t * vm)
 {
   vnet_device_main_t *vdm = &vnet_device_main;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_thread_registration_t *tr;
+  uword *p;
 
   vec_validate_aligned (vdm->workers, tm->n_vlib_mains - 1,
 			CLIB_CACHE_LINE_BYTES);
+
+  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+  if (tr && tr->count > 0)
+    {
+      vdm->first_worker_cpu_index = tr->first_index;
+      vdm->next_worker_cpu_index = tr->first_index;
+      vdm->last_worker_cpu_index = tr->first_index + tr->count - 1;
+    }
   return 0;
 }
 
 VLIB_INIT_FUNCTION (vnet_device_init);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h
index a5cbc35..bbb29fe 100644
--- a/src/vnet/devices/devices.h
+++ b/src/vnet/devices/devices.h
@@ -50,12 +50,38 @@
 typedef struct
 {
   vnet_device_per_worker_data_t *workers;
+  uword first_worker_cpu_index;
+  uword last_worker_cpu_index;
+  uword next_worker_cpu_index;
 } vnet_device_main_t;
 
+typedef struct
+{
+  u32 hw_if_index;
+  u32 dev_instance;
+  u16 queue_id;
+} vnet_device_and_queue_t;
+
+typedef struct
+{
+  vnet_device_and_queue_t *devices_and_queues;
+} vnet_device_input_runtime_t;
+
 extern vnet_device_main_t vnet_device_main;
 extern vlib_node_registration_t device_input_node;
 extern const u32 device_input_next_node_advance[];
 
+static inline void
+vnet_set_device_input_node (u32 hw_if_index, u32 node_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  hw->input_node_index = node_index;
+}
+
+void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id,
+				      uword cpu_index);
+
 static inline u64
 vnet_get_aggregate_rx_packets (void)
 {
@@ -78,6 +104,25 @@
   pwd->aggregate_rx_packets += count;
 }
 
+static_always_inline vnet_device_and_queue_t *
+vnet_get_device_and_queue (vlib_main_t * vm, vlib_node_runtime_t * node)
+{
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  return rt->devices_and_queues;
+}
+
+static_always_inline void
+vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index,
+					 u16 queue_id)
+{
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+
+  ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue));
+  u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id];
+  vlib_node_set_interrupt_pending (vlib_mains[cpu_index],
+				   hw->input_node_index);
+}
+
 #endif /* included_vnet_vnet_device_h */
 
 /*
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index ef8f911..a1ea2d6 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -464,6 +464,12 @@
 #define VNET_HW_INTERFACE_BOND_INFO_NONE ((uword *) 0)
 #define VNET_HW_INTERFACE_BOND_INFO_SLAVE ((uword *) ~0)
 
+  /* Input node */
+  u32 input_node_index;
+
+  /* input node cpu index by queue */
+  u32 *input_node_cpu_index_by_queue;
+
 } vnet_hw_interface_t;
 
 extern vnet_device_class_t vnet_local_interface_device_class;