interface: tx queue infra

Type: improvement
Change-Id: I415b2f980de10ca3154d2c8677c24792453eccd0
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/plugins/avf/avf.h b/src/plugins/avf/avf.h
index ea931dc..c03a7c2 100644
--- a/src/plugins/avf/avf.h
+++ b/src/plugins/avf/avf.h
@@ -191,6 +191,7 @@
 
   avf_tx_desc_t *tmp_descs;
   u32 *tmp_bufs;
+  u32 queue_index;
 } avf_txq_t;
 
 typedef struct
diff --git a/src/plugins/avf/device.c b/src/plugins/avf/device.c
index 70ea446..05946a4 100644
--- a/src/plugins/avf/device.c
+++ b/src/plugins/avf/device.c
@@ -21,6 +21,7 @@
 #include <vlib/pci/pci.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/interface/rx_queue_funcs.h>
+#include <vnet/interface/tx_queue_funcs.h>
 
 #include <avf/avf.h>
 
@@ -303,8 +304,7 @@
     {
       qid = qid % ad->num_queue_pairs;
       txq = vec_elt_at_index (ad->txqs, qid);
-      if (txq->lock == 0)
-	clib_spinlock_init (&txq->lock);
+      clib_spinlock_init (&txq->lock);
       ad->flags |= AVF_DEVICE_F_SHARED_TXQ_LOCK;
       return 0;
     }
@@ -1748,6 +1748,14 @@
 	}
       ad->rxqs[i].queue_index = qi;
     }
+
+  for (i = 0; i < ad->n_tx_queues; i++)
+    {
+      u32 qi = vnet_hw_if_register_tx_queue (vnm, ad->hw_if_index, i);
+      vnet_hw_if_tx_queue_assign_thread (vnm, qi, i);
+      ad->txqs[i].queue_index = qi;
+    }
+
   vnet_hw_if_update_runtime_data (vnm, ad->hw_if_index);
 
   if (pool_elts (am->devices) == 1)
diff --git a/src/plugins/avf/output.c b/src/plugins/avf/output.c
index 7804335..4cc9d5a 100644
--- a/src/plugins/avf/output.c
+++ b/src/plugins/avf/output.c
@@ -375,16 +375,17 @@
 {
   vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
   avf_device_t *ad = avf_get_device (rd->dev_instance);
-  u32 thread_index = vm->thread_index;
-  u8 qid = thread_index;
-  avf_txq_t *txq = vec_elt_at_index (ad->txqs, qid % ad->num_queue_pairs);
+  vnet_hw_if_tx_frame_t *tf = vlib_frame_scalar_args (frame);
+  u8 qid = tf->queue_id;
+  avf_txq_t *txq = vec_elt_at_index (ad->txqs, qid);
   u16 next;
   u16 mask = txq->size - 1;
   u32 *buffers = vlib_frame_vector_args (frame);
   u16 n_enq, n_left, n_desc, *slot;
   u16 n_retry = 2;
 
-  clib_spinlock_lock_if_init (&txq->lock);
+  if (tf->shared_queue)
+    clib_spinlock_lock (&txq->lock);
 
   n_left = frame->n_vectors;
 
@@ -474,7 +475,8 @@
 			AVF_TX_ERROR_NO_FREE_SLOTS, n_left);
     }
 
-  clib_spinlock_unlock_if_init (&txq->lock);
+  if (tf->shared_queue)
+    clib_spinlock_unlock (&txq->lock);
 
   return frame->n_vectors - n_left;
 }
diff --git a/src/vnet/CMakeLists.txt b/src/vnet/CMakeLists.txt
index 6e02efd..03ace21 100644
--- a/src/vnet/CMakeLists.txt
+++ b/src/vnet/CMakeLists.txt
@@ -37,6 +37,7 @@
   interface_format.c
   interface_output.c
   interface/rx_queue.c
+  interface/tx_queue.c
   interface/runtime.c
   interface_stats.c
   misc.c
@@ -58,6 +59,7 @@
   global_funcs.h
   handoff.h
   interface/rx_queue_funcs.h
+  interface/tx_queue_funcs.h
   interface.h
   interface_funcs.h
   interface_output.h
diff --git a/src/vnet/interface.c b/src/vnet/interface.c
index 1d59a96..5c0ccaa 100644
--- a/src/vnet/interface.c
+++ b/src/vnet/interface.c
@@ -43,6 +43,7 @@
 #include <vnet/adj/adj_mcast.h>
 #include <vnet/ip/ip.h>
 #include <vnet/interface/rx_queue_funcs.h>
+#include <vnet/interface/tx_queue_funcs.h>
 
 /* *INDENT-OFF* */
 VLIB_REGISTER_LOG_CLASS (if_default_log, static) = {
@@ -958,7 +959,7 @@
       r.type = VLIB_NODE_TYPE_INTERNAL;
       r.runtime_data = &rt;
       r.runtime_data_bytes = sizeof (rt);
-      r.scalar_size = 0;
+      r.scalar_size = sizeof (vnet_hw_if_tx_frame_t);
       r.vector_size = sizeof (u32);
 
       r.flags = VLIB_NODE_FLAG_IS_OUTPUT;
@@ -1062,8 +1063,9 @@
   /* Call delete callbacks. */
   call_hw_interface_add_del_callbacks (vnm, hw_if_index, /* is_create */ 0);
 
-  /* delete rx queues */
+  /* delete rx & tx queues */
   vnet_hw_if_unregister_all_rx_queues (vnm, hw_if_index);
+  vnet_hw_if_unregister_all_tx_queues (vnm, hw_if_index);
   vnet_hw_if_update_runtime_data (vnm, hw_if_index);
 
   /* Delete any sub-interfaces. */
@@ -1407,6 +1409,8 @@
 
   im->rxq_index_by_hw_if_index_and_queue_id =
     hash_create_mem (0, sizeof (u64), sizeof (u32));
+  im->txq_index_by_hw_if_index_and_queue_id =
+    hash_create_mem (0, sizeof (u64), sizeof (u32));
   im->sw_if_index_by_sup_and_sub = hash_create_mem (0, sizeof (u64),
 						    sizeof (uword));
   {
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index 0ffaffe..f65b653 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -599,6 +599,39 @@
 #define VNET_HW_IF_RXQ_NO_RX_INTERRUPT ~0
 } vnet_hw_if_rx_queue_t;
 
+typedef struct
+{
+  u8 shared_queue : 1;
+  /* hw interface index */
+  u32 hw_if_index;
+
+  /* hardware queue identifier */
+  u32 queue_id;
+
+  /* bitmap of threads which use this queue */
+  clib_bitmap_t *threads;
+} vnet_hw_if_tx_queue_t;
+
+typedef enum
+{
+  VNET_HW_IF_TX_FRAME_HINT_NOT_CHAINED = (1 << 0),
+  VNET_HW_IF_TX_FRAME_HINT_NO_GSO = (1 << 1),
+  VNET_HW_IF_TX_FRAME_HINT_NO_CKSUM_OFFLOAD = (1 << 2),
+} vnet_hw_if_tx_frame_hint_t;
+
+typedef struct
+{
+  u8 shared_queue : 1;
+  vnet_hw_if_tx_frame_hint_t hints : 16;
+  u32 queue_id;
+} vnet_hw_if_tx_frame_t;
+
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  vnet_hw_if_tx_frame_t frame;
+} vnet_hw_if_output_node_runtime_t;
+
 /* Hardware-interface.  This corresponds to a physical wire
    that packets flow over. */
 typedef struct vnet_hw_interface_t
@@ -635,6 +668,9 @@
   /* Software index for this hardware interface. */
   u32 sw_if_index;
 
+  /* per thread output-node runtimes */
+  vnet_hw_if_output_node_runtime_t *output_node_thread_runtimes;
+
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
 
   /* Interface name. */
@@ -688,6 +724,9 @@
   /* rx queues */
   u32 *rx_queue_indices;
 
+  /* tx queues */
+  u32 *tx_queue_indices;
+
   /* numa node that hardware device connects to */
   u8 numa_node;
 
@@ -955,6 +994,10 @@
   vnet_hw_if_rx_queue_t *hw_if_rx_queues;
   uword *rxq_index_by_hw_if_index_and_queue_id;
 
+  /* Hardware interface TX queues */
+  vnet_hw_if_tx_queue_t *hw_if_tx_queues;
+  uword *txq_index_by_hw_if_index_and_queue_id;
+
   /* Hash table mapping HW interface name to index. */
   uword *hw_interface_by_name;
 
diff --git a/src/vnet/interface/runtime.c b/src/vnet/interface/runtime.c
index 20ac51f..4fb24bf 100644
--- a/src/vnet/interface/runtime.c
+++ b/src/vnet/interface/runtime.c
@@ -19,6 +19,7 @@
 #include <vnet/ip/ip.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/interface/rx_queue_funcs.h>
+#include <vnet/interface/tx_queue_funcs.h>
 #include <vlib/unix/unix.h>
 
 VLIB_REGISTER_LOG_CLASS (if_rxq_log, static) = {
@@ -62,10 +63,12 @@
   u32 node_index = hi->input_node_index;
   vnet_hw_if_rx_queue_t *rxq;
   vnet_hw_if_rxq_poll_vector_t *pv, **d = 0;
+  vnet_hw_if_output_node_runtime_t *new_out_runtimes = 0;
   vlib_node_state_t *per_thread_node_state = 0;
   u32 n_threads = vlib_get_n_threads ();
   u16 *per_thread_node_adaptive = 0;
-  int something_changed = 0;
+  int something_changed_on_rx = 0;
+  int something_changed_on_tx = 0;
   clib_bitmap_t *pending_int = 0;
   int last_int = -1;
 
@@ -81,13 +84,14 @@
   pool_foreach (rxq, im->hw_if_rx_queues)
     {
       u32 ti = rxq->thread_index;
+      vnet_hw_interface_t *rxq_hi;
 
       ASSERT (rxq->mode != VNET_HW_IF_RX_MODE_UNKNOWN);
       ASSERT (rxq->mode != VNET_HW_IF_RX_MODE_DEFAULT);
 
-      hi = vnet_get_hw_interface (vnm, rxq->hw_if_index);
+      rxq_hi = vnet_get_hw_interface (vnm, rxq->hw_if_index);
 
-      if (hi->input_node_index != node_index)
+      if (rxq_hi->input_node_index != node_index)
 	continue;
 
       if (rxq->mode == VNET_HW_IF_RX_MODE_POLLING)
@@ -111,10 +115,11 @@
   pool_foreach (rxq, im->hw_if_rx_queues)
     {
       u32 ti = rxq->thread_index;
+      vnet_hw_interface_t *rxq_hi;
 
-      hi = vnet_get_hw_interface (vnm, rxq->hw_if_index);
+      rxq_hi = vnet_get_hw_interface (vnm, rxq->hw_if_index);
 
-      if (hi->input_node_index != node_index)
+      if (rxq_hi->input_node_index != node_index)
 	continue;
 
       if (rxq->mode == VNET_HW_IF_RX_MODE_INTERRUPT ||
@@ -140,7 +145,7 @@
       old_state = vlib_node_get_state (ovm, node_index);
       if (per_thread_node_state[i] != old_state)
 	{
-	  something_changed = 1;
+	  something_changed_on_rx = 1;
 	  log_debug ("state changed for node %U on thread %u from %s to %s",
 		     format_vlib_node_name, vm, node_index, i,
 		     node_state_str[old_state],
@@ -148,21 +153,62 @@
 	}
 
       /* check if something changed */
-      if (something_changed == 0)
+      if (something_changed_on_rx == 0)
 	{
 	  vnet_hw_if_rx_node_runtime_t *rt;
 	  rt = vlib_node_get_runtime_data (ovm, node_index);
 	  if (vec_len (rt->rxq_poll_vector) != vec_len (d[i]))
-	    something_changed = 1;
+	    something_changed_on_rx = 1;
 	  else if (memcmp (d[i], rt->rxq_poll_vector,
 			   vec_len (d[i]) * sizeof (*d)))
-	    something_changed = 1;
+	    something_changed_on_rx = 1;
 	  if (clib_interrupt_get_n_int (rt->rxq_interrupts) != last_int + 1)
-	    something_changed = 1;
+	    something_changed_on_rx = 1;
 	}
     }
 
-  if (something_changed)
+  /* Build the candidate per-thread output runtimes on a private copy; it is
+   * only swapped into the interface further down if something differs. */
+  new_out_runtimes =
+    vec_dup_aligned (hi->output_node_thread_runtimes, CLIB_CACHE_LINE_BYTES);
+  /* vec_validate takes the highest valid index, not the element count */
+  vec_validate_aligned (new_out_runtimes, n_threads - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  /* first run, or the number of threads changed */
+  if (vec_len (hi->output_node_thread_runtimes) != vec_len (new_out_runtimes))
+    something_changed_on_tx = 1;
+
+  for (int i = 0; i < vec_len (hi->tx_queue_indices); i++)
+    {
+      u32 thread_index;
+      u32 queue_index = hi->tx_queue_indices[i];
+      vnet_hw_if_tx_queue_t *txq = vnet_hw_if_get_tx_queue (vnm, queue_index);
+
+      /* a queue used by more than one thread must be locked by the driver */
+      txq->shared_queue = clib_bitmap_count_set_bits (txq->threads) > 1;
+
+      clib_bitmap_foreach (thread_index, txq->threads)
+	{
+	  vnet_hw_if_output_node_runtime_t *rt;
+	  rt = vec_elt_at_index (new_out_runtimes, thread_index);
+	  if ((rt->frame.queue_id != txq->queue_id) ||
+	      (rt->frame.shared_queue != txq->shared_queue))
+	    {
+	      log_debug ("tx queue data changed for interface %v, thread %u "
+			 "(queue_id %u -> %u, shared_queue %u -> %u)",
+			 hi->name, thread_index, rt->frame.queue_id,
+			 txq->queue_id, rt->frame.shared_queue,
+			 txq->shared_queue);
+	      something_changed_on_tx = 1;
+	      /* record the new values so the swapped-in copy is current */
+	      rt->frame.queue_id = txq->queue_id;
+	      rt->frame.shared_queue = txq->shared_queue;
+	    }
+	}
+    }
+
+  if (something_changed_on_rx || something_changed_on_tx)
     {
       int with_barrier;
 
@@ -177,35 +209,46 @@
       if (with_barrier)
 	vlib_worker_thread_barrier_sync (vm);
 
-      for (int i = 0; i < n_threads; i++)
+      if (something_changed_on_rx)
 	{
-	  vlib_main_t *vm = vlib_get_main_by_index (i);
-	  vnet_hw_if_rx_node_runtime_t *rt;
-	  rt = vlib_node_get_runtime_data (vm, node_index);
-	  pv = rt->rxq_poll_vector;
-	  rt->rxq_poll_vector = d[i];
-	  d[i] = pv;
-
-	  if (rt->rxq_interrupts)
+	  for (int i = 0; i < n_threads; i++)
 	    {
-	      void *in = rt->rxq_interrupts;
-	      int int_num = -1;
-	      while ((int_num = clib_interrupt_get_next (in, int_num)) != -1)
+	      vlib_main_t *vm = vlib_get_main_by_index (i);
+	      vnet_hw_if_rx_node_runtime_t *rt;
+	      rt = vlib_node_get_runtime_data (vm, node_index);
+	      pv = rt->rxq_poll_vector;
+	      rt->rxq_poll_vector = d[i];
+	      d[i] = pv;
+
+	      if (rt->rxq_interrupts)
 		{
-		  clib_interrupt_clear (in, int_num);
-		  pending_int = clib_bitmap_set (pending_int, int_num, 1);
-		  last_int = clib_max (last_int, int_num);
+		  void *in = rt->rxq_interrupts;
+		  int int_num = -1;
+		  while ((int_num = clib_interrupt_get_next (in, int_num)) !=
+			 -1)
+		    {
+		      clib_interrupt_clear (in, int_num);
+		      pending_int = clib_bitmap_set (pending_int, int_num, 1);
+		      last_int = clib_max (last_int, int_num);
+		    }
 		}
+
+	      vlib_node_set_state (vm, node_index, per_thread_node_state[i]);
+	      vlib_node_set_flag (vm, node_index, VLIB_NODE_FLAG_ADAPTIVE_MODE,
+				  per_thread_node_adaptive[i]);
+
+	      if (last_int >= 0)
+		clib_interrupt_resize (&rt->rxq_interrupts, last_int + 1);
+	      else
+		clib_interrupt_free (&rt->rxq_interrupts);
 	    }
-
-	  vlib_node_set_state (vm, node_index, per_thread_node_state[i]);
-	  vlib_node_set_flag (vm, node_index, VLIB_NODE_FLAG_ADAPTIVE_MODE,
-			      per_thread_node_adaptive[i]);
-
-	  if (last_int >= 0)
-	    clib_interrupt_resize (&rt->rxq_interrupts, last_int + 1);
-	  else
-	    clib_interrupt_free (&rt->rxq_interrupts);
+	}
+      if (something_changed_on_tx)
+	{
+	  vnet_hw_if_output_node_runtime_t *t;
+	  t = hi->output_node_thread_runtimes;
+	  hi->output_node_thread_runtimes = new_out_runtimes;
+	  new_out_runtimes = t;
 	}
 
       if (with_barrier)
@@ -231,4 +274,5 @@
   vec_free (d);
   vec_free (per_thread_node_state);
   vec_free (per_thread_node_adaptive);
+  vec_free (new_out_runtimes);
 }
diff --git a/src/vnet/interface/rx_queue_funcs.h b/src/vnet/interface/rx_queue_funcs.h
index e1e6c33..26dc1b8 100644
--- a/src/vnet/interface/rx_queue_funcs.h
+++ b/src/vnet/interface/rx_queue_funcs.h
@@ -33,7 +33,6 @@
 						 u32 queue_index);
 void vnet_hw_if_set_rx_queue_thread_index (vnet_main_t *vnm, u32 queue_index,
 					   u32 thread_index);
-void vnet_hw_if_update_runtime_data (vnet_main_t *vnm, u32 hw_if_index);
 void vnet_hw_if_generate_rxq_int_poll_vector (vlib_main_t *vm,
 					      vlib_node_runtime_t *node);
 
diff --git a/src/vnet/interface/tx_queue.c b/src/vnet/interface/tx_queue.c
new file mode 100644
index 0000000..3041a58
--- /dev/null
+++ b/src/vnet/interface/tx_queue.c
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/interface/tx_queue_funcs.h>
+#include <vlib/unix/unix.h>
+
+VLIB_REGISTER_LOG_CLASS (if_txq_log, static) = {
+  .class_name = "interface",
+  .subclass_name = "tx-queue",
+};
+
+#define log_debug(fmt, ...) vlib_log_debug (if_txq_log.class, fmt, __VA_ARGS__)
+#define log_err(fmt, ...)   vlib_log_err (if_txq_log.class, fmt, __VA_ARGS__)
+
+static u64
+tx_queue_key (u32 hw_if_index, u32 queue_id)
+{
+  return ((u64) hw_if_index << 32) | queue_id;
+}
+
+u32
+vnet_hw_if_get_tx_queue_index_by_id (vnet_main_t *vnm, u32 hw_if_index,
+				     u32 queue_id)
+{
+  vnet_interface_main_t *im = &vnm->interface_main;
+  u64 key = tx_queue_key (hw_if_index, queue_id);
+  uword *p = hash_get_mem (im->txq_index_by_hw_if_index_and_queue_id, &key);
+  return p ? p[0] : ~0;
+}
+
+u32
+vnet_hw_if_register_tx_queue (vnet_main_t *vnm, u32 hw_if_index, u32 queue_id)
+{
+  vnet_interface_main_t *im = &vnm->interface_main;
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+  vnet_hw_if_tx_queue_t *txq;
+  u64 key = tx_queue_key (hw_if_index, queue_id);
+  u32 queue_index;
+
+  if (hash_get_mem (im->txq_index_by_hw_if_index_and_queue_id, &key))
+    clib_panic ("Trying to register already registered queue id (%u) in the "
+		"interface %v\n",
+		queue_id, hi->name);
+
+  pool_get_zero (im->hw_if_tx_queues, txq);
+  queue_index = txq - im->hw_if_tx_queues;
+  vec_add1 (hi->tx_queue_indices, queue_index);
+  hash_set_mem_alloc (&im->txq_index_by_hw_if_index_and_queue_id, &key,
+		      queue_index);
+  txq->hw_if_index = hw_if_index;
+  txq->queue_id = queue_id;
+
+  log_debug ("register: interface %v queue-id %u", hi->name, queue_id);
+
+  return queue_index;
+}
+
+void
+vnet_hw_if_unregister_tx_queue (vnet_main_t *vnm, u32 queue_index)
+{
+  vnet_interface_main_t *im = &vnm->interface_main;
+  vnet_hw_if_tx_queue_t *txq;
+  txq = vnet_hw_if_get_tx_queue (vnm, queue_index);
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, txq->hw_if_index);
+  u64 key;
+
+  key = tx_queue_key (txq->hw_if_index, txq->queue_id);
+  hash_unset_mem_free (&im->txq_index_by_hw_if_index_and_queue_id, &key);
+
+  for (int i = 0; i < vec_len (hi->tx_queue_indices); i++)
+    if (hi->tx_queue_indices[i] == queue_index)
+      {
+	vec_del1 (hi->tx_queue_indices, i);
+	break;
+      }
+
+  log_debug ("unregister: interface %v queue-id %u", hi->name, txq->queue_id);
+  clib_bitmap_free (txq->threads);
+  pool_put_index (im->hw_if_tx_queues, queue_index);
+}
+
+void
+vnet_hw_if_unregister_all_tx_queues (vnet_main_t *vnm, u32 hw_if_index)
+{
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+  vnet_interface_main_t *im = &vnm->interface_main;
+  vnet_hw_if_tx_queue_t *txq;
+  u64 key;
+
+  log_debug ("unregister_all: interface %v", hi->name);
+
+  for (int i = 0; i < vec_len (hi->tx_queue_indices); i++)
+    {
+      txq = vnet_hw_if_get_tx_queue (vnm, hi->tx_queue_indices[i]);
+      key = tx_queue_key (txq->hw_if_index, txq->queue_id);
+      hash_unset_mem_free (&im->txq_index_by_hw_if_index_and_queue_id, &key);
+
+      clib_bitmap_free (txq->threads);
+      pool_put_index (im->hw_if_tx_queues, hi->tx_queue_indices[i]);
+    }
+
+  vec_free (hi->tx_queue_indices);
+}
+
+void
+vnet_hw_if_tx_queue_assign_thread (vnet_main_t *vnm, u32 queue_index,
+				   u32 thread_index)
+{
+  vnet_hw_if_tx_queue_t *txq = vnet_hw_if_get_tx_queue (vnm, queue_index);
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, txq->hw_if_index);
+  txq->threads = clib_bitmap_set (txq->threads, thread_index, 1);
+  log_debug ("assign_thread: interface %v queue-id %u thread %u", hi->name,
+	     txq->queue_id, thread_index);
+}
+
+void
+vnet_hw_if_tx_queue_unassign_thread (vnet_main_t *vnm, u32 queue_index,
+				     u32 thread_index)
+{
+  vnet_hw_if_tx_queue_t *txq = vnet_hw_if_get_tx_queue (vnm, queue_index);
+  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, txq->hw_if_index);
+  txq->threads = clib_bitmap_set (txq->threads, thread_index, 0);
+  log_debug ("unassign_thread: interface %v queue-id %u thread %u", hi->name,
+	     txq->queue_id, thread_index);
+}
diff --git a/src/vnet/interface/tx_queue_funcs.h b/src/vnet/interface/tx_queue_funcs.h
new file mode 100644
index 0000000..22956a4
--- /dev/null
+++ b/src/vnet/interface/tx_queue_funcs.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vnet_interface_tx_queue_funcs_h
+#define included_vnet_interface_tx_queue_funcs_h
+
+#include <vnet/vnet.h>
+
+/* function declarations */
+
+u32 vnet_hw_if_get_tx_queue_index_by_id (vnet_main_t *vnm, u32 hw_if_index,
+					 u32 queue_id);
+u32 vnet_hw_if_register_tx_queue (vnet_main_t *vnm, u32 hw_if_index,
+				  u32 queue_id);
+void vnet_hw_if_unregister_tx_queue (vnet_main_t *vnm, u32 queue_index);
+void vnet_hw_if_unregister_all_tx_queues (vnet_main_t *vnm, u32 hw_if_index);
+void vnet_hw_if_tx_queue_assign_thread (vnet_main_t *vnm, u32 queue_index,
+					u32 thread_index);
+void vnet_hw_if_tx_queue_unassign_thread (vnet_main_t *vnm, u32 queue_index,
+					  u32 thread_index);
+
+/* inline functions */
+
+/* Look up a TX queue by its pool index. Returns 0 when queue_index refers
+ * to a free pool slot, so callers that may hold a stale index must check. */
+static_always_inline vnet_hw_if_tx_queue_t *
+vnet_hw_if_get_tx_queue (vnet_main_t *vnm, u32 queue_index)
+{
+  vnet_interface_main_t *im = &vnm->interface_main;
+  if (pool_is_free_index (im->hw_if_tx_queues, queue_index))
+    return 0;
+  return pool_elt_at_index (im->hw_if_tx_queues, queue_index);
+}
+
+#endif /* included_vnet_interface_tx_queue_funcs_h */
diff --git a/src/vnet/interface_format.c b/src/vnet/interface_format.c
index 86a3d59..f66797c 100644
--- a/src/vnet/interface_format.c
+++ b/src/vnet/interface_format.c
@@ -43,6 +43,7 @@
 #include <vnet/l2/l2_output.h>
 #include <vnet/l2/l2_vtr.h>
 #include <vnet/interface/rx_queue_funcs.h>
+#include <vnet/interface/tx_queue_funcs.h>
 
 u8 *
 format_vtr (u8 * s, va_list * args)
@@ -208,6 +209,20 @@
 	}
     }
 
+  if (vec_len (hi->tx_queue_indices))
+    {
+      s = format (s, "\n%UTX Queues:", format_white_space, indent + 2);
+      s = format (s, "\n%U%-6s%-15s", format_white_space, indent + 4, "queue",
+		  "thread(s)");
+      for (int i = 0; i < vec_len (hi->tx_queue_indices); i++)
+	{
+	  vnet_hw_if_tx_queue_t *txq;
+	  txq = vnet_hw_if_get_tx_queue (vnm, hi->tx_queue_indices[i]);
+	  s = format (s, "\n%U%-6u%U", format_white_space, indent + 4,
+		      txq->queue_id, format_bitmap_list, txq->threads);
+	}
+    }
+
   if (hi->rss_queues)
     {
       s = format (s, "\n%URSS queues: %U", format_white_space, indent + 2,
diff --git a/src/vnet/interface_funcs.h b/src/vnet/interface_funcs.h
index 565dcd5..9bcce3d 100644
--- a/src/vnet/interface_funcs.h
+++ b/src/vnet/interface_funcs.h
@@ -443,6 +443,8 @@
 						vnet_hw_interface_t * hi,
 						clib_bitmap_t * bitmap);
 
+void vnet_hw_if_update_runtime_data (vnet_main_t *vnm, u32 hw_if_index);
+
 /* Formats sw/hw interface. */
 format_function_t format_vnet_hw_interface;
 format_function_t format_vnet_hw_if_rx_mode;
diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c
index 7d058c2..4566964 100644
--- a/src/vnet/interface_output.c
+++ b/src/vnet/interface_output.c
@@ -47,6 +47,7 @@
 #include <vnet/feature/feature.h>
 #include <vnet/classify/pcap_classify.h>
 #include <vnet/interface_output.h>
+#include <vppinfra/vector_funcs.h>
 
 typedef struct
 {
@@ -321,6 +322,72 @@
     }
 }
 
+static_always_inline void
+store_tx_frame_scalar_data (vnet_hw_if_output_node_runtime_t *r,
+			    vnet_hw_if_tx_frame_t *tf)
+{
+  if (r)
+    clib_memcpy_fast (tf, &r->frame, sizeof (vnet_hw_if_tx_frame_t));
+}
+
+static_always_inline void
+enqueu_to_tx_node (vlib_main_t *vm, vlib_node_runtime_t *node,
+		   vnet_hw_interface_t *hi, u32 *from, u32 n_vectors)
+{
+  u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX;
+  vnet_hw_if_output_node_runtime_t *r = 0;
+  u32 n_free, n_copy, *to;
+  vnet_hw_if_tx_frame_t *tf;
+  vlib_frame_t *f;
+
+  ASSERT (n_vectors <= VLIB_FRAME_SIZE);
+
+  if (hi->output_node_thread_runtimes)
+    r = vec_elt_at_index (hi->output_node_thread_runtimes, vm->thread_index);
+
+  f = vlib_get_next_frame_internal (vm, node, next_index, 0);
+  tf = vlib_frame_scalar_args (f);
+
+  if (f->n_vectors > 0 && (r == 0 || tf->queue_id == r->frame.queue_id))
+    {
+      /* append current next frame */
+      n_free = VLIB_FRAME_SIZE - f->n_vectors;
+      n_copy = clib_min (n_vectors, n_free);
+      n_vectors -= n_copy;
+      to = vlib_frame_vector_args (f);
+      to += f->n_vectors;
+    }
+  else
+    {
+      if (f->n_vectors > 0)
+	{
+	  /* current frame doesn't fit - grab empty one */
+	  f = vlib_get_next_frame_internal (vm, node, next_index, 1);
+	  tf = vlib_frame_scalar_args (f);
+	}
+
+      /* empty frame - store scalar data */
+      store_tx_frame_scalar_data (r, tf);
+      to = vlib_frame_vector_args (f);
+      n_free = VLIB_FRAME_SIZE;
+      n_copy = n_vectors;
+      n_vectors = 0;
+    }
+
+  vlib_buffer_copy_indices (to, from, n_copy);
+  vlib_put_next_frame (vm, node, next_index, n_free - n_copy);
+
+  if (n_vectors == 0)
+    return;
+
+  /* we have more indices to store, take empty frame */
+  from += n_copy;
+  f = vlib_get_next_frame_internal (vm, node, next_index, 1);
+  store_tx_frame_scalar_data (r, vlib_frame_scalar_args (f));
+  vlib_buffer_copy_indices (vlib_frame_vector_args (f), from, n_vectors);
+  vlib_put_next_frame (vm, node, next_index, VLIB_FRAME_SIZE - n_vectors);
+}
+
 VLIB_NODE_FN (vnet_interface_output_node)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
@@ -405,8 +472,16 @@
     n_bytes = vnet_interface_output_node_inline (
       vm, sw_if_index, ccm, bufs, config_index, arc, n_buffers, 1, 1);
 
-  vlib_buffer_enqueue_to_single_next (vm, node, vlib_frame_vector_args (frame),
-				      next_index, frame->n_vectors);
+  from = vlib_frame_vector_args (frame);
+  if (PREDICT_TRUE (next_index == VNET_INTERFACE_OUTPUT_NEXT_TX))
+    {
+      enqueu_to_tx_node (vm, node, hi, from, frame->n_vectors);
+    }
+  else
+    {
+      vlib_buffer_enqueue_to_single_next (vm, node, from, next_index,
+					  frame->n_vectors);
+    }
 
   /* Update main interface stats. */
   vlib_increment_combined_counter (ccm, ti, sw_if_index, n_buffers, n_bytes);
@@ -993,10 +1068,16 @@
 {
   vnet_main_t *vnm = vnet_get_main ();
   vnet_interface_main_t *im = &vnm->interface_main;
+  vnet_hw_if_output_node_runtime_t *r = 0;
+  vnet_hw_interface_t *hi;
+  vnet_hw_if_tx_frame_t *tf;
   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
-  u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
-  u32 *from, n_left;
-  u16 *lt = im->if_out_arc_end_next_index_by_sw_if_index;
+  u32 sw_if_indices[VLIB_FRAME_SIZE], *sw_if_index = sw_if_indices;
+  u64 used_elts[VLIB_FRAME_SIZE / 64] = {};
+  u64 mask[VLIB_FRAME_SIZE / 64] = {};
+  u32 *tmp, *from, n_left, n_free, n_comp, *to, swif, off;
+  u16 next_index;
+  vlib_frame_t *f;
 
   from = vlib_frame_vector_args (frame);
   n_left = frame->n_vectors;
@@ -1008,25 +1089,117 @@
       vlib_prefetch_buffer_header (b[5], LOAD);
       vlib_prefetch_buffer_header (b[6], LOAD);
       vlib_prefetch_buffer_header (b[7], LOAD);
-      next[0] = vec_elt (lt, vnet_buffer (b[0])->sw_if_index[VLIB_TX]);
-      next[1] = vec_elt (lt, vnet_buffer (b[1])->sw_if_index[VLIB_TX]);
-      next[2] = vec_elt (lt, vnet_buffer (b[2])->sw_if_index[VLIB_TX]);
-      next[3] = vec_elt (lt, vnet_buffer (b[3])->sw_if_index[VLIB_TX]);
+      sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
+      sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_TX];
+      sw_if_index[2] = vnet_buffer (b[2])->sw_if_index[VLIB_TX];
+      sw_if_index[3] = vnet_buffer (b[3])->sw_if_index[VLIB_TX];
 
       b += 4;
-      next += 4;
+      sw_if_index += 4;
       n_left -= 4;
     }
 
   while (n_left)
     {
-      next[0] = vec_elt (lt, vnet_buffer (b[0])->sw_if_index[VLIB_TX]);
+      sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
       b++;
-      next++;
+      sw_if_index++;
       n_left--;
     }
 
-  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
+  n_left = frame->n_vectors;
+  swif = sw_if_indices[0];
+  off = 0;
+
+  /* a bit ugly but it allows us to reuse stack space for temporary store
+   * which may also improve memory latency */
+  tmp = (u32 *) bufs;
+
+more:
+  next_index = vec_elt (im->if_out_arc_end_next_index_by_sw_if_index, swif);
+  hi = vnet_get_sup_hw_interface (vnm, swif);
+  /* reset for every interface so data from the previous one is not reused */
+  r = 0;
+  if (hi->output_node_thread_runtimes)
+    r = vec_elt_at_index (hi->output_node_thread_runtimes, vm->thread_index);
+  f = vlib_get_next_frame_internal (vm, node, next_index, 0);
+  tf = vlib_frame_scalar_args (f);
+
+  if (f->n_vectors > 0 && (r == 0 || r->frame.queue_id == tf->queue_id))
+    {
+      /* append frame */
+      n_free = VLIB_FRAME_SIZE - f->n_vectors;
+      /* compress can emit up to frame->n_vectors indices, so write
+       * directly into the frame only if all of them are sure to fit */
+      if (n_free >= frame->n_vectors)
+	to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+      else
+	to = tmp;
+    }
+  else
+    {
+      if (f->n_vectors > 0)
+	{
+	  /* current frame doesn't fit - grab empty one */
+	  f = vlib_get_next_frame_internal (vm, node, next_index, 1);
+	  tf = vlib_frame_scalar_args (f);
+	}
+
+      /* empty frame - store scalar data */
+      store_tx_frame_scalar_data (r, tf);
+      n_free = VLIB_FRAME_SIZE;
+      to = vlib_frame_vector_args (f);
+    }
+
+  /* compare and compress based on comparison mask */
+  clib_mask_compare_u32 (swif, sw_if_indices, mask, frame->n_vectors);
+  n_comp = clib_compress_u32 (to, from, mask, frame->n_vectors);
+
+  if (tmp != to)
+    {
+      /* indices already written to frame, just close it */
+      vlib_put_next_frame (vm, node, next_index, n_free - n_comp);
+    }
+  else if (n_free >= n_comp)
+    {
+      /* enough space in the existing frame */
+      to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+      vlib_buffer_copy_indices (to, tmp, n_comp);
+      vlib_put_next_frame (vm, node, next_index, n_free - n_comp);
+    }
+  else
+    {
+      /* full frame */
+      to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+      vlib_buffer_copy_indices (to, tmp, n_free);
+      vlib_put_next_frame (vm, node, next_index, 0);
+
+      /* second frame */
+      u32 n_frame2 = n_comp - n_free;
+      f = vlib_get_next_frame_internal (vm, node, next_index, 1);
+      to = vlib_frame_vector_args (f);
+      vlib_buffer_copy_indices (to, tmp + n_free, n_frame2);
+      tf = vlib_frame_scalar_args (f);
+      store_tx_frame_scalar_data (r, tf);
+      vlib_put_next_frame (vm, node, next_index, VLIB_FRAME_SIZE - n_frame2);
+    }
+
+  n_left -= n_comp;
+  if (n_left)
+    {
+      /* store comparison mask so we can find next unused element */
+      for (int i = 0; i < ARRAY_LEN (used_elts); i++)
+	used_elts[i] |= mask[i];
+
+      /* find first unused sw_if_index by scanning through used_elts bitmap */
+      while (PREDICT_FALSE (used_elts[off] == ~0))
+	off++;
+
+      swif =
+	sw_if_indices[(off << 6) + count_trailing_zeros (~used_elts[off])];
+      goto more;
+    }
+
   return frame->n_vectors;
 }