interface: add multi tx-queues support for new tx infra

Type: feature

Change-Id: I231f782b3c56dc2b10321e4569ac7acdad1c11da
Signed-off-by: Mohsin Kazmi <sykazmi@cisco.com>
diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c
index 3b01819..f8c5191 100644
--- a/src/plugins/memif/memif.c
+++ b/src/plugins/memif/memif.c
@@ -855,10 +855,10 @@
 }
 
 /* *INDENT-OFF* */
-VNET_HW_INTERFACE_CLASS (memif_ip_hw_if_class, static) =
-{
+VNET_HW_INTERFACE_CLASS (memif_ip_hw_if_class, static) = {
   .name = "memif-ip",
   .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P,
+  .tx_hash_fn_type = VNET_HASH_FN_TYPE_IP,
 };
 /* *INDENT-ON* */
 
diff --git a/src/vnet/devices/tap/tap.c b/src/vnet/devices/tap/tap.c
index 2d075f9..4c0b4e0 100644
--- a/src/vnet/devices/tap/tap.c
+++ b/src/vnet/devices/tap/tap.c
@@ -58,13 +58,11 @@
       goto error; \
     }
 
-  /* *INDENT-OFF* */
-VNET_HW_INTERFACE_CLASS (tun_device_hw_interface_class, static) =
-{
+VNET_HW_INTERFACE_CLASS (tun_device_hw_interface_class, static) = {
   .name = "tun-device",
   .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P,
+  .tx_hash_fn_type = VNET_HASH_FN_TYPE_IP,
 };
-  /* *INDENT-ON* */
 
 #define TUN_MAX_PACKET_BYTES	 65355
 #define TUN_MIN_PACKET_BYTES	 64
diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c
index d287748..c941f82 100644
--- a/src/vnet/ethernet/interface.c
+++ b/src/vnet/ethernet/interface.c
@@ -313,6 +313,7 @@
 /* *INDENT-OFF* */
 VNET_HW_INTERFACE_CLASS (ethernet_hw_interface_class) = {
   .name = "Ethernet",
+  .tx_hash_fn_type = VNET_HASH_FN_TYPE_ETHERNET,
   .format_address = format_ethernet_address,
   .format_header = format_ethernet_header_with_length,
   .unformat_hw_address = unformat_ethernet_address,
diff --git a/src/vnet/interface.api b/src/vnet/interface.api
index d89dea4..172f6af 100644
--- a/src/vnet/interface.api
+++ b/src/vnet/interface.api
@@ -458,6 +458,29 @@
     bool is_main;
 };
 
+/** \brief Set an interface's tx-placement
+    Tx-queue placement on specific threads is supported only for hardware
+    interfaces. It will not set queue-to-thread placement for sub-interfaces,
+    p2p and pipe interfaces.
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - the interface whose tx-placement will be set
+    @param queue_id - the queue number whose tx-placement will be set
+    @param array_size - the number of entries in the threads array
+    @param threads - the indexes of the main and/or worker thread(s) on
+                     which the tx queue will be placed
+*/
+autoendian autoreply define sw_interface_set_tx_placement
+{
+    u32 client_index;
+    u32 context;
+    vl_api_interface_index_t sw_if_index;
+    u32 queue_id;
+    u32 array_size;
+    u32 threads[array_size];
+    option vat_help = "<interface | sw_if_index <index>> queue <n> [threads <list> | mask <hex>]";
+};
+
 /** \brief Set custom interface name
     Set custom interface name for the interface.
     @param client_index - opaque cookie to identify the sender
@@ -512,6 +535,60 @@
   vl_api_rx_mode_t mode;
 };
 
+service {
+  rpc sw_interface_tx_placement_get returns sw_interface_tx_placement_get_reply
+    stream sw_interface_tx_placement_details;
+};
+
+/** \brief Get the tx queue placement of interface(s)
+    @param cursor - optional, allows the client to continue a dump
+    @param sw_if_index - optional index of the interface whose queue placement
+      is requested. sw_if_index = ~0 returns the placement information for all
+      interfaces. No information is returned for sub-interfaces, p2p
+      and pipe interfaces.
+*/
+autoendian define sw_interface_tx_placement_get
+{
+  u32 client_index;
+  u32 context;
+  u32 cursor;
+  vl_api_interface_index_t sw_if_index;
+  option vat_help = "[interface | sw_if_index <index>]";
+};
+
+autoendian define sw_interface_tx_placement_get_reply
+{
+  u32 context;
+  i32 retval;
+  u32 cursor;
+};
+
+/** \brief Show the interface's queue-to-thread placement
+    This API is used to display the placement of an interface's tx
+    queues on threads. One message per tx-queue per interface will
+    be sent to the client.
+    Each message contains the interface index, the tx-queue id, the
+    thread(s) on which this tx-queue is placed and whether the
+    tx-queue is shared.
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - the interface whose tx-placement is reported
+    @param queue_id - the queue id
+    @param shared - the queue is shared with other threads
+    @param array_size - the number of entries in the threads array
+    @param threads - the main and/or worker thread index(es) the queue is placed on
+*/
+autoendian define sw_interface_tx_placement_details
+{
+  u32 client_index;
+  u32 context;
+  vl_api_interface_index_t sw_if_index;
+  u32 queue_id;
+  u8 shared;
+  u32 array_size;
+  u32 threads[array_size];
+};
+
 /* Gross kludge, DGMS */
 autoreply define interface_name_renumber
 {
diff --git a/src/vnet/interface.c b/src/vnet/interface.c
index ce02499..982abbd 100644
--- a/src/vnet/interface.c
+++ b/src/vnet/interface.c
@@ -863,6 +863,10 @@
   hw->hw_if_index = hw_index;
   hw->default_rx_mode = VNET_HW_IF_RX_MODE_POLLING;
 
+  if (hw_class->tx_hash_fn_type == VNET_HASH_FN_TYPE_ETHERNET ||
+      hw_class->tx_hash_fn_type == VNET_HASH_FN_TYPE_IP)
+    hw->hf = vnet_hash_default_function (hw_class->tx_hash_fn_type);
+
   if (dev_class->format_device_name)
     hw->name = format (0, "%U", dev_class->format_device_name, dev_instance);
   else if (hw_class->format_interface_name)
@@ -1020,6 +1024,7 @@
 	static char *e[] = {
 	  "interface is down",
 	  "interface is deleted",
+	  "no tx queue available",
 	};
 
 	r.n_errors = ARRAY_LEN (e);
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index fe42e5d..822d4c3 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -44,6 +44,7 @@
 #include <vppinfra/pcap.h>
 #include <vnet/l3_types.h>
 #include <vppinfra/lock.h>
+#include <vnet/hash/hash.h>
 
 struct vnet_main_t;
 struct vnet_hw_interface_t;
@@ -410,6 +411,9 @@
   /* Flags */
   vnet_hw_interface_class_flags_t flags;
 
+  /* tx hash type for interfaces of this hw class */
+  vnet_hash_fn_type_t tx_hash_fn_type;
+
   /* Function to call when hardware interface is added/deleted. */
   vnet_interface_function_t *interface_add_del_function;
 
@@ -641,8 +645,9 @@
 typedef struct
 {
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
-  vnet_hw_if_tx_frame_t frame;
-  u32 n_threads;
+  vnet_hw_if_tx_frame_t *frame;
+  u32 *lookup_table;
+  u32 n_queues;
 } vnet_hw_if_output_node_runtime_t;
 
 /* Hardware-interface.  This corresponds to a physical wire
@@ -696,6 +701,9 @@
      used by node function vnet_per_buffer_interface_output() */
   u32 output_node_next_index;
 
+  /* called when hw interface is using transmit side packet steering */
+  vnet_hash_fn_t hf;
+
   /* Maximum transmit rate for this interface in bits/sec. */
   f64 max_rate_bits_per_sec;
 
diff --git a/src/vnet/interface/runtime.c b/src/vnet/interface/runtime.c
index e63f1ec..5c215e8 100644
--- a/src/vnet/interface/runtime.c
+++ b/src/vnet/interface/runtime.c
@@ -184,39 +184,73 @@
 	}
     }
 
-  new_out_runtimes =
-    vec_dup_aligned (hi->output_node_thread_runtimes, CLIB_CACHE_LINE_BYTES);
-  vec_validate_aligned (new_out_runtimes, n_threads - 1,
-			CLIB_CACHE_LINE_BYTES);
-
-  if (vec_len (hi->output_node_thread_runtimes) != vec_len (new_out_runtimes))
-    something_changed_on_tx = 1;
-
-  for (int i = 0; i < vec_len (hi->tx_queue_indices); i++)
+  if (vec_len (hi->tx_queue_indices) > 0)
     {
-      u32 thread_index;
-      u32 queue_index = hi->tx_queue_indices[i];
-      vnet_hw_if_tx_queue_t *txq = vnet_hw_if_get_tx_queue (vnm, queue_index);
-      uword n_threads = clib_bitmap_count_set_bits (txq->threads);
+      new_out_runtimes = vec_dup_aligned (hi->output_node_thread_runtimes,
+					  CLIB_CACHE_LINE_BYTES);
+      vec_validate_aligned (new_out_runtimes, n_threads - 1,
+			    CLIB_CACHE_LINE_BYTES);
 
-      clib_bitmap_foreach (thread_index, txq->threads)
+      for (u32 i = 0; i < vec_len (new_out_runtimes); i++)
 	{
 	  vnet_hw_if_output_node_runtime_t *rt;
-	  rt = vec_elt_at_index (new_out_runtimes, thread_index);
-	  if ((rt->frame.queue_id != txq->queue_id) ||
-	      (rt->n_threads != n_threads))
+	  rt = vec_elt_at_index (new_out_runtimes, i);
+	  u32 n_queues = 0, total_queues = vec_len (hi->tx_queue_indices);
+	  rt->frame = 0;
+	  rt->lookup_table = 0;
+
+	  for (u32 j = 0; j < total_queues; j++)
 	    {
+	      u32 queue_index = hi->tx_queue_indices[j];
+	      vnet_hw_if_tx_frame_t frame = { .shared_queue = 0,
+					      .hints = 7,
+					      .queue_id = ~0 };
+	      vnet_hw_if_tx_queue_t *txq =
+		vnet_hw_if_get_tx_queue (vnm, queue_index);
+	      if (!clib_bitmap_get (txq->threads, i))
+		continue;
+
 	      log_debug ("tx queue data changed for interface %v, thread %u "
-			 "(queue_id %u -> %u, n_threads %u -> %u)",
-			 hi->name, thread_index, rt->frame.queue_id,
-			 txq->queue_id, rt->n_threads, n_threads);
+			 "(queue_id %u)",
+			 hi->name, i, txq->queue_id);
 	      something_changed_on_tx = 1;
-	      rt->frame.queue_id = txq->queue_id;
-	      rt->frame.shared_queue = txq->shared_queue;
-	      rt->n_threads = n_threads;
+
+	      frame.queue_id = txq->queue_id;
+	      frame.shared_queue = txq->shared_queue;
+	      vec_add1 (rt->frame, frame);
+	      n_queues++;
+	    }
+
+	  // don't initialize rt->n_queues above
+	  if (rt->n_queues != n_queues)
+	    {
+	      something_changed_on_tx = 1;
+	      rt->n_queues = n_queues;
+	    }
+	  /*
+	   * It is only used in case of multiple txq.
+	   */
+	  if (rt->n_queues > 0)
+	    {
+	      if (!is_pow2 (n_queues))
+		n_queues = max_pow2 (n_queues);
+
+	      vec_validate_aligned (rt->lookup_table, n_queues - 1,
+				    CLIB_CACHE_LINE_BYTES);
+
+	      for (u32 k = 0; k < vec_len (rt->lookup_table); k++)
+		{
+		  rt->lookup_table[k] = rt->frame[k % rt->n_queues].queue_id;
+		  log_debug ("tx queue lookup table changed for interface %v, "
+			     "(lookup table [%u]=%u)",
+			     hi->name, k, rt->lookup_table[k]);
+		}
 	    }
 	}
     }
+  else
+    /* interface deleted */
+    something_changed_on_tx = 1;
 
   if (something_changed_on_rx || something_changed_on_tx)
     {
@@ -303,6 +337,11 @@
     {
       vec_free (d[i]);
       vec_free (a[i]);
+      if (new_out_runtimes)
+	{
+	  vec_free (new_out_runtimes[i].frame);
+	  vec_free (new_out_runtimes[i].lookup_table);
+	}
     }
 
   vec_free (d);
diff --git a/src/vnet/interface/tx_queue_funcs.h b/src/vnet/interface/tx_queue_funcs.h
index 22956a4..8fcf7c3 100644
--- a/src/vnet/interface/tx_queue_funcs.h
+++ b/src/vnet/interface/tx_queue_funcs.h
@@ -27,3 +27,20 @@
     return 0;
   return pool_elt_at_index (im->hw_if_tx_queues, queue_index);
 }
+/* Sort comparator for tx queue pointers: by hw_if_index, then by queue_id. */
+static_always_inline int
+vnet_hw_if_txq_cmp_cli_api (vnet_hw_if_tx_queue_t **a,
+			    vnet_hw_if_tx_queue_t **b)
+{
+  if (*a == *b)
+    return 0;
+
+  if (a[0]->hw_if_index != b[0]->hw_if_index)
+    return 2 * (a[0]->hw_if_index > b[0]->hw_if_index) - 1;
+
+  if (a[0]->queue_id != b[0]->queue_id)
+    return 2 * (a[0]->queue_id > b[0]->queue_id) - 1;
+  /* two distinct queues with identical interface and queue id: unexpected */
+  ASSERT (0);
+  return ~0;
+}
diff --git a/src/vnet/interface_api.c b/src/vnet/interface_api.c
index a765c0b..5218f74 100644
--- a/src/vnet/interface_api.c
+++ b/src/vnet/interface_api.c
@@ -22,6 +22,7 @@
 
 #include <vnet/interface.h>
 #include <vnet/interface/rx_queue_funcs.h>
+#include <vnet/interface/tx_queue_funcs.h>
 #include <vnet/api_errno.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vnet/ip/ip.h>
@@ -56,7 +57,9 @@
   _ (SW_INTERFACE_ADD_DEL_ADDRESS, sw_interface_add_del_address)              \
   _ (SW_INTERFACE_SET_RX_MODE, sw_interface_set_rx_mode)                      \
   _ (SW_INTERFACE_RX_PLACEMENT_DUMP, sw_interface_rx_placement_dump)          \
+  _ (SW_INTERFACE_TX_PLACEMENT_GET, sw_interface_tx_placement_get)            \
   _ (SW_INTERFACE_SET_RX_PLACEMENT, sw_interface_set_rx_placement)            \
+  _ (SW_INTERFACE_SET_TX_PLACEMENT, sw_interface_set_tx_placement)            \
   _ (SW_INTERFACE_SET_TABLE, sw_interface_set_table)                          \
   _ (SW_INTERFACE_GET_TABLE, sw_interface_get_table)                          \
   _ (SW_INTERFACE_SET_UNNUMBERED, sw_interface_set_unnumbered)                \
@@ -1216,6 +1219,168 @@
 }
 
 static void
+send_interface_tx_placement_details (vnet_hw_if_tx_queue_t **all_queues,
+				     u32 index, vl_api_registration_t *rp,
+				     u32 context)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vl_api_sw_interface_tx_placement_details_t *rmp;
+  u32 n_bits = 0, v = ~0;
+  vnet_hw_if_tx_queue_t **q = vec_elt_at_index (all_queues, index);
+  uword *bitmap = q[0]->threads;
+  u32 hw_if_index = q[0]->hw_if_index;
+  vnet_hw_interface_t *hw_if = vnet_get_hw_interface (vnm, hw_if_index);
+  /* payload carries one u32 thread index per bit set in the queue's bitmap */
+  n_bits = clib_bitmap_count_set_bits (bitmap);
+  u32 n = n_bits * sizeof (u32);
+
+  /*
+   * FIXME: Use the REPLY_MACRO_DETAILS5_END once endian handler is registered
+   * and available.
+   */
+  REPLY_MACRO_DETAILS5 (
+    VL_API_SW_INTERFACE_TX_PLACEMENT_DETAILS, n, rp, context, ({
+      rmp->sw_if_index = clib_host_to_net_u32 (hw_if->sw_if_index);
+      rmp->queue_id = clib_host_to_net_u32 (q[0]->queue_id);
+      rmp->shared = q[0]->shared_queue;
+      rmp->array_size = clib_host_to_net_u32 (n_bits);
+
+      v = clib_bitmap_first_set (bitmap);
+      for (u32 i = 0; i < n_bits; i++)
+	{
+	  rmp->threads[i] = clib_host_to_net_u32 (v);
+	  v = clib_bitmap_next_set (bitmap, v + 1);
+	}
+    }));
+}
+/* API handler: stream the tx queue placement of one or all hw interfaces. */
+static void
+vl_api_sw_interface_tx_placement_get_t_handler (
+  vl_api_sw_interface_tx_placement_get_t *mp)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vl_api_sw_interface_tx_placement_get_reply_t *rmp = 0;
+  vnet_hw_if_tx_queue_t **all_queues = 0;
+  vnet_hw_if_tx_queue_t *q;
+  u32 sw_if_index = mp->sw_if_index;
+  i32 rv = 0;
+
+  if (pool_elts (vnm->interface_main.hw_if_tx_queues) == 0)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto err;
+    }
+  /* ~0: report every tx queue, sorted by interface then queue id */
+  if (sw_if_index == ~0)
+    {
+      pool_foreach (q, vnm->interface_main.hw_if_tx_queues)
+	vec_add1 (all_queues, q);
+      vec_sort_with_function (all_queues, vnet_hw_if_txq_cmp_cli_api);
+    }
+  else
+    {
+      u32 qi = ~0;
+      vnet_sw_interface_t *si;
+
+      if (!vnet_sw_if_index_is_api_valid (sw_if_index))
+	{
+	  clib_warning ("sw_if_index %u does not exist", sw_if_index);
+	  rv = VNET_API_ERROR_INVALID_SW_IF_INDEX;
+	  goto err;
+	}
+
+      si = vnet_get_sw_interface (vnm, sw_if_index);
+      if (si->type != VNET_SW_INTERFACE_TYPE_HARDWARE)
+	{
+	  clib_warning ("interface type is not HARDWARE! P2P, PIPE and SUB"
+			" interfaces are not supported");
+	  rv = VNET_API_ERROR_INVALID_INTERFACE;
+	  goto err;
+	}
+
+      vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, si->hw_if_index);
+      for (qi = 0; qi < vec_len (hw->tx_queue_indices); qi++)
+	{
+	  q = vnet_hw_if_get_tx_queue (vnm, hw->tx_queue_indices[qi]);
+	  vec_add1 (all_queues, q);
+	}
+    }
+
+  REPLY_AND_DETAILS_VEC_MACRO_END (VL_API_SW_INTERFACE_TX_PLACEMENT_GET_REPLY,
+				   all_queues, mp, rmp, rv, ({
+				     send_interface_tx_placement_details (
+				       all_queues, cursor, rp, mp->context);
+				   }));
+
+  vec_free (all_queues);
+  return;
+
+err:
+  REPLY_MACRO_END (VL_API_SW_INTERFACE_TX_PLACEMENT_GET_REPLY);
+}
+/* API handler: place one tx queue of a hw interface on a set of threads. */
+static void
+vl_api_sw_interface_set_tx_placement_t_handler (
+  vl_api_sw_interface_set_tx_placement_t *mp)
+{
+  vl_api_sw_interface_set_tx_placement_reply_t *rmp;
+  vnet_main_t *vnm = vnet_get_main ();
+  u32 sw_if_index = mp->sw_if_index;
+  vnet_sw_interface_t *si;
+  uword *bitmap = 0;
+  u32 queue_id = ~0;
+  u32 size = 0;
+  clib_error_t *error = 0;
+  int rv = 0;
+
+  VALIDATE_SW_IF_INDEX_END (mp);
+
+  si = vnet_get_sw_interface (vnm, sw_if_index);
+  if (si->type != VNET_SW_INTERFACE_TYPE_HARDWARE)
+    {
+      rv = VNET_API_ERROR_INVALID_VALUE;
+      goto bad_sw_if_index;
+    }
+  /* NOTE(review): array_size/threads[] are not range-checked here - confirm */
+  size = mp->array_size;
+  for (u32 i = 0; i < size; i++)
+    {
+      u32 thread_index = mp->threads[i];
+      bitmap = clib_bitmap_set (bitmap, thread_index, 1);
+    }
+
+  queue_id = mp->queue_id;
+  rv = set_hw_interface_tx_queue (si->hw_if_index, queue_id, bitmap);
+
+  switch (rv)
+    {
+    case VNET_API_ERROR_INVALID_VALUE:
+      error = clib_error_return (
+	0, "please specify valid thread(s) - last thread index %u",
+	clib_bitmap_last_set (bitmap));
+      break;
+    case VNET_API_ERROR_INVALID_QUEUE:
+      error = clib_error_return (
+	0, "unknown queue %u on interface %s", queue_id,
+	vnet_get_hw_interface (vnet_get_main (), si->hw_if_index)->name);
+      break;
+    default:
+      break;
+    }
+
+  if (error)
+    {
+      clib_error_report (error);
+      goto out;
+    }
+
+  BAD_SW_IF_INDEX_LABEL;
+out:
+  REPLY_MACRO_END (VL_API_SW_INTERFACE_SET_TX_PLACEMENT_REPLY);
+  clib_bitmap_free (bitmap);
+}
+
+static void
 vl_api_create_vlan_subif_t_handler (vl_api_create_vlan_subif_t * mp)
 {
   vl_api_create_vlan_subif_reply_t *rmp;
@@ -1474,6 +1639,10 @@
   /* Do not replay VL_API_SW_INTERFACE_DUMP messages */
   am->api_trace_cfg[VL_API_SW_INTERFACE_DUMP].replay_enable = 0;
 
+  /* Mark these APIs as autoendian */
+  am->is_autoendian[VL_API_SW_INTERFACE_SET_TX_PLACEMENT] = 1;
+  am->is_autoendian[VL_API_SW_INTERFACE_TX_PLACEMENT_GET] = 1;
+
   /*
    * Set up the (msg_name, crc, message-id) table
    */
diff --git a/src/vnet/interface_cli.c b/src/vnet/interface_cli.c
index 4f6f2cf..d2e748a 100644
--- a/src/vnet/interface_cli.c
+++ b/src/vnet/interface_cli.c
@@ -53,6 +53,7 @@
 #include <vnet/classify/vnet_classify.h>
 #include <vnet/interface/rx_queue_funcs.h>
 #include <vnet/interface/tx_queue_funcs.h>
+#include <vnet/hash/hash.h>
 static int
 compare_interface_names (void *a1, void *a2)
 {
@@ -1840,28 +1841,24 @@
 };
 /* *INDENT-ON* */
 
-clib_error_t *
+int
 set_hw_interface_tx_queue (u32 hw_if_index, u32 queue_id, uword *bitmap)
 {
   vnet_main_t *vnm = vnet_get_main ();
-  vnet_device_main_t *vdm = &vnet_device_main;
-  vnet_hw_interface_t *hw;
+  vlib_thread_main_t *vtm = vlib_get_thread_main ();
   vnet_hw_if_tx_queue_t *txq;
   u32 queue_index;
   u32 thread_index;
 
-  hw = vnet_get_hw_interface (vnm, hw_if_index);
-
   /* highest set bit in bitmap should not exceed last worker thread index */
   thread_index = clib_bitmap_last_set (bitmap);
-  if ((thread_index != ~0) && (thread_index > vdm->last_worker_thread_index))
-    return clib_error_return (0, "please specify valid thread(s)");
+  if ((thread_index != ~0) && (thread_index >= vtm->n_vlib_mains))
+    return VNET_API_ERROR_INVALID_VALUE;
 
   queue_index =
     vnet_hw_if_get_tx_queue_index_by_id (vnm, hw_if_index, queue_id);
   if (queue_index == ~0)
-    return clib_error_return (0, "unknown queue %u on interface %s", queue_id,
-			      hw->name);
+    return VNET_API_ERROR_INVALID_QUEUE;
 
   txq = vnet_hw_if_get_tx_queue (vnm, queue_index);
 
@@ -1889,6 +1886,7 @@
   u32 hw_if_index = (u32) ~0;
   u32 queue_id = (u32) 0;
   uword *bitmap = 0;
+  int rv = 0;
 
   if (!unformat_user (input, unformat_line_input, line_input))
     return 0;
@@ -1920,7 +1918,23 @@
       goto error;
     }
 
-  error = set_hw_interface_tx_queue (hw_if_index, queue_id, bitmap);
+  rv = set_hw_interface_tx_queue (hw_if_index, queue_id, bitmap);
+
+  switch (rv)
+    {
+    case VNET_API_ERROR_INVALID_VALUE:
+      error = clib_error_return (
+	0, "please specify valid thread(s) - last thread index %u",
+	clib_bitmap_last_set (bitmap));
+      break;
+    case VNET_API_ERROR_INVALID_QUEUE:
+      error = clib_error_return (
+	0, "unknown queue %u on interface %s", queue_id,
+	vnet_get_hw_interface (vnet_get_main (), hw_if_index)->name);
+      break;
+    default:
+      break;
+    }
 
 error:
   clib_bitmap_free (bitmap);
@@ -2467,6 +2481,132 @@
   .function = set_interface_name,
   .is_mp_safe = 1,
 };
+/* CLI handler: select the tx hash function used by a hw interface. */
+static clib_error_t *
+set_interface_tx_hash_cmd (vlib_main_t *vm, unformat_input_t *input,
+			   vlib_cli_command_t *cmd)
+{
+  clib_error_t *error = 0;
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hi;
+  u8 *hash_name = 0;
+  u32 hw_if_index = (u32) ~0;
+  vnet_hash_fn_t hf;
+  vnet_hash_fn_type_t ftype;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "%U", unformat_vnet_hw_interface, vnm,
+		    &hw_if_index))
+	;
+      else if (unformat (line_input, "hash-name %s", &hash_name))
+	;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  unformat_free (line_input);
+	  /* hash_name may already be allocated: leave through the cleanup */
+	  goto error;
+	}
+    }
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto error;
+    }
+
+  hi = vnet_get_hw_interface (vnm, hw_if_index);
+  ftype =
+    vnet_get_hw_interface_class (vnm, hi->hw_class_index)->tx_hash_fn_type;
+  /* "hash-name" may have been omitted: never look up a NULL name */
+  hf = hash_name ?
+	 vnet_hash_function_from_name ((const char *) hash_name, ftype) : 0;
+  if (!hf)
+    {
+      error = clib_error_return (0, "please specify valid hash name");
+      goto error;
+    }
+  hi->hf = hf;
+error:
+  vec_free (hash_name);
+  return (error);
+}
+
+VLIB_CLI_COMMAND (cmd_set_if_tx_hash, static) = {
+  .path = "set interface tx-hash",
+  .short_help = "set interface tx-hash <interface> hash-name <hash-name>",
+  .function = set_interface_tx_hash_cmd,
+};
+/* CLI handler: print the tx hash function configured on a hw interface. */
+static clib_error_t *
+show_tx_hash (vlib_main_t *vm, unformat_input_t *input,
+	      vlib_cli_command_t *cmd)
+{
+  clib_error_t *error = 0;
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hi;
+  vnet_hash_function_registration_t *hash;
+  u32 hw_if_index = (u32) ~0;
+  vnet_hash_fn_type_t ftype;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "%U", unformat_vnet_hw_interface, vnm,
+		    &hw_if_index))
+	;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  unformat_free (line_input);
+	  goto error;
+	}
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto error;
+    }
+
+  hi = vnet_get_hw_interface (vnm, hw_if_index);
+  ftype =
+    vnet_get_hw_interface_class (vnm, hi->hw_class_index)->tx_hash_fn_type;
+  /* map the function pointer back to its registration to print it */
+  if (hi->hf)
+    {
+      hash = vnet_hash_function_from_func (hi->hf, ftype);
+      if (hash)
+	vlib_cli_output (vm, "%U", format_vnet_hash, hash);
+      else
+	vlib_cli_output (vm, "no matching hash function found");
+    }
+  else
+    vlib_cli_output (vm, "no hashing function set");
+
+error:
+  return (error);
+}
+/* The interface argument is mandatory: show_tx_hash rejects its absence. */
+VLIB_CLI_COMMAND (cmd_show_tx_hash, static) = {
+  .path = "show interface tx-hash",
+  .short_help = "show interface tx-hash <interface>",
+  .function = show_tx_hash,
+};
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/interface_format.c b/src/vnet/interface_format.c
index 4acd6ab..cb428e7 100644
--- a/src/vnet/interface_format.c
+++ b/src/vnet/interface_format.c
@@ -212,6 +212,9 @@
   if (vec_len (hi->tx_queue_indices))
     {
       s = format (s, "\n%UTX Queues:", format_white_space, indent + 2);
+      s = format (
+	s, "\n%UTX Hash: %U", format_white_space, indent + 4, format_vnet_hash,
+	vnet_hash_function_from_func (hi->hf, hw_class->tx_hash_fn_type));
       s = format (s, "\n%U%-6s%-7s%-15s", format_white_space, indent + 4,
 		  "queue", "shared", "thread(s)");
       for (int i = 0; i < vec_len (hi->tx_queue_indices); i++)
diff --git a/src/vnet/interface_funcs.h b/src/vnet/interface_funcs.h
index f253b23..eef5596 100644
--- a/src/vnet/interface_funcs.h
+++ b/src/vnet/interface_funcs.h
@@ -427,6 +427,8 @@
 /* Set rx-placement on the interface */
 clib_error_t *set_hw_interface_rx_placement (u32 hw_if_index, u32 queue_id,
 					     u32 thread_index, u8 is_main);
+/* Set tx-queue placement on the interface */
+int set_hw_interface_tx_queue (u32 hw_if_index, u32 queue_id, uword *bitmap);
 
 /* Set the MTU on the HW interface */
 void vnet_hw_interface_set_mtu (vnet_main_t * vnm, u32 hw_if_index, u32 mtu);
@@ -509,6 +511,7 @@
 {
   VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DOWN,
   VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DELETED,
+  VNET_INTERFACE_OUTPUT_ERROR_NO_TX_QUEUE,
 } vnet_interface_output_error_t;
 
 /* Format for interface output traces. */
diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c
index 72ceb95..659273b 100644
--- a/src/vnet/interface_output.c
+++ b/src/vnet/interface_output.c
@@ -46,10 +46,12 @@
 #include <vnet/udp/udp_packet.h>
 #include <vnet/feature/feature.h>
 #include <vnet/classify/pcap_classify.h>
+#include <vnet/hash/hash.h>
 #include <vnet/interface_output.h>
 #include <vppinfra/vector/mask_compare.h>
 #include <vppinfra/vector/compress.h>
 #include <vppinfra/vector/count_equal.h>
+#include <vppinfra/vector/array_mask.h>
 
 typedef struct
 {
@@ -176,8 +178,9 @@
 static_always_inline uword
 vnet_interface_output_node_inline (vlib_main_t *vm, u32 sw_if_index,
 				   vlib_combined_counter_main_t *ccm,
-				   vlib_buffer_t **b, u32 config_index, u8 arc,
-				   u32 n_left, int processing_level)
+				   vlib_buffer_t **b, void **p,
+				   u32 config_index, u8 arc, u32 n_left,
+				   int processing_level)
 {
   u32 n_bytes = 0;
   u32 n_bytes0, n_bytes1, n_bytes2, n_bytes3;
@@ -208,6 +211,15 @@
       n_bytes += n_bytes2 = vlib_buffer_length_in_chain (vm, b[2]);
       n_bytes += n_bytes3 = vlib_buffer_length_in_chain (vm, b[3]);
 
+      if (processing_level >= 3)
+	{
+	  p[0] = vlib_buffer_get_current (b[0]);
+	  p[1] = vlib_buffer_get_current (b[1]);
+	  p[2] = vlib_buffer_get_current (b[2]);
+	  p[3] = vlib_buffer_get_current (b[3]);
+	  p += 4;
+	}
+
       if (processing_level >= 2)
 	{
 	  u32 tx_swif0, tx_swif1, tx_swif2, tx_swif3;
@@ -262,6 +274,12 @@
 
       n_bytes += n_bytes0 = vlib_buffer_length_in_chain (vm, b[0]);
 
+      if (processing_level >= 3)
+	{
+	  p[0] = vlib_buffer_get_current (b[0]);
+	  p += 1;
+	}
+
       if (processing_level >= 2)
 	{
 	  u32 tx_swif0 = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
@@ -344,39 +362,71 @@
 }
 
 static_always_inline void
-store_tx_frame_scalar_data (vnet_hw_if_output_node_runtime_t *r,
-			    vnet_hw_if_tx_frame_t *tf)
+hash_func_with_mask (void **p, u32 *hash, u32 n_packets, u32 *lookup_table,
+		     u32 mask, vnet_hash_fn_t hf)
 {
-  if (r)
-    clib_memcpy_fast (tf, &r->frame, sizeof (vnet_hw_if_tx_frame_t));
+  u32 n_left_from = n_packets;
+
+  hf (p, hash, n_packets);
+
+  clib_array_mask_u32 (hash, mask, n_packets);
+
+  while (n_left_from >= 4)
+    {
+      hash[0] = lookup_table[hash[0]];
+      hash[1] = lookup_table[hash[1]];
+      hash[2] = lookup_table[hash[2]];
+      hash[3] = lookup_table[hash[3]];
+
+      hash += 4;
+      n_left_from -= 4;
+    }
+
+  while (n_left_from > 0)
+    {
+      hash[0] = lookup_table[hash[0]];
+
+      hash += 1;
+      n_left_from -= 1;
+    }
 }
 
 static_always_inline void
-enqueu_to_tx_node (vlib_main_t *vm, vlib_node_runtime_t *node,
-		   vnet_hw_interface_t *hi, u32 *from, u32 n_vectors)
+store_tx_frame_scalar_data (vnet_hw_if_tx_frame_t *copy_frame,
+			    vnet_hw_if_tx_frame_t *tf)
 {
-  u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX;
-  vnet_hw_if_output_node_runtime_t *r = 0;
-  u32 n_free, n_copy, *to;
-  vnet_hw_if_tx_frame_t *tf;
+  if (copy_frame)
+    clib_memcpy_fast (tf, copy_frame, sizeof (vnet_hw_if_tx_frame_t));
+}
+
+static_always_inline u32
+enqueue_one_to_tx_node (vlib_main_t *vm, vlib_node_runtime_t *node, u32 *ppqi,
+			u32 *from, vnet_hw_if_tx_frame_t *copy_frame,
+			u32 n_vectors, u32 n_left, u32 next_index)
+{
+  u32 tmp[VLIB_FRAME_SIZE];
+  u64 mask[VLIB_FRAME_SIZE / 64] = {};
   vlib_frame_t *f;
-
-  ASSERT (n_vectors <= VLIB_FRAME_SIZE);
-
-  if (hi->output_node_thread_runtimes)
-    r = vec_elt_at_index (hi->output_node_thread_runtimes, vm->thread_index);
+  vnet_hw_if_tx_frame_t *tf;
+  u32 *to;
+  u32 n_copy = 0, n_free = 0;
 
   f = vlib_get_next_frame_internal (vm, node, next_index, 0);
   tf = vlib_frame_scalar_args (f);
 
-  if (f->n_vectors > 0 && (r == 0 || tf->queue_id == r->frame.queue_id))
+  if (f->n_vectors > 0 &&
+      (!copy_frame || (tf->queue_id == copy_frame->queue_id)))
     {
       /* append current next frame */
       n_free = VLIB_FRAME_SIZE - f->n_vectors;
-      n_copy = clib_min (n_vectors, n_free);
-      n_vectors -= n_copy;
-      to = vlib_frame_vector_args (f);
-      to += f->n_vectors;
+      /*
+       * if frame contains enough space for worst case scenario,
+       * we can avoid use of tmp
+       */
+      if (n_free >= n_left)
+	to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+      else
+	to = tmp;
     }
   else
     {
@@ -388,25 +438,113 @@
 	}
 
       /* empty frame - store scalar data */
-      store_tx_frame_scalar_data (r, tf);
+      store_tx_frame_scalar_data (copy_frame, tf);
       to = vlib_frame_vector_args (f);
       n_free = VLIB_FRAME_SIZE;
-      n_copy = n_vectors;
-      n_vectors = 0;
     }
 
-  vlib_buffer_copy_indices (to, from, n_copy);
-  vlib_put_next_frame (vm, node, next_index, n_free - n_copy);
+  /*
+   * per packet queue id array
+   * compare with given queue_id, if match, copy respective buffer index from
+   * -> to
+   */
+  if (ppqi)
+    {
+      clib_mask_compare_u32 (copy_frame->queue_id, ppqi, mask, n_vectors);
+      n_copy = clib_compress_u32 (to, from, mask, n_vectors);
 
-  if (n_vectors == 0)
-    return;
+      if (n_copy == 0)
+	return n_left;
+    }
+  else
+    {
+      /*
+       * no work required, just copy all buffer indices from -> to
+       */
+      n_copy = n_left;
+      vlib_buffer_copy_indices (to, from, n_copy);
+    }
 
-  /* we have more indices to store, take empty frame */
-  from += n_copy;
-  f = vlib_get_next_frame_internal (vm, node, next_index, 1);
-  store_tx_frame_scalar_data (r, vlib_frame_scalar_args (f));
-  vlib_buffer_copy_indices (vlib_frame_vector_args (f), from, n_vectors);
-  vlib_put_next_frame (vm, node, next_index, VLIB_FRAME_SIZE - n_vectors);
+  if (to != tmp)
+    {
+      /* indices already written to frame, just close it */
+      vlib_put_next_frame (vm, node, next_index, n_free - n_copy);
+    }
+  else if (n_free >= n_copy)
+    {
+      /* enough space in the existing frame */
+      to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+      vlib_buffer_copy_indices (to, tmp, n_copy);
+      vlib_put_next_frame (vm, node, next_index, n_free - n_copy);
+    }
+  else
+    {
+      /* full frame */
+      to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
+      vlib_buffer_copy_indices (to, tmp, n_free);
+      vlib_put_next_frame (vm, node, next_index, 0);
+
+      /* second frame */
+      u32 n_2nd_frame = n_copy - n_free;
+      f = vlib_get_next_frame_internal (vm, node, next_index, 1);
+      tf = vlib_frame_scalar_args (f);
+      /* empty frame - store scalar data */
+      store_tx_frame_scalar_data (copy_frame, tf);
+      to = vlib_frame_vector_args (f);
+      vlib_buffer_copy_indices (to, tmp + n_free, n_2nd_frame);
+      vlib_put_next_frame (vm, node, next_index,
+			   VLIB_FRAME_SIZE - n_2nd_frame);
+    }
+
+  return n_left - n_copy;
+}
+/* Enqueue a vector of buffers to next_index, split per tx queue if needed. */
+static_always_inline void
+enqueue_to_tx_node (vlib_main_t *vm, vlib_node_runtime_t *node,
+		    vnet_hw_interface_t *hi, u32 next_index,
+		    vnet_hw_if_output_node_runtime_t *r, u32 *from, void **p,
+		    u32 n_vectors)
+{
+  u32 n_left = n_vectors;
+
+  ASSERT (n_vectors <= VLIB_FRAME_SIZE);
+
+  /*
+   * no per-thread runtime: driver is not integrated with the new tx infra
+   */
+  if (r == 0)
+    {
+      n_left = enqueue_one_to_tx_node (vm, node, NULL, from, NULL, n_vectors,
+				       n_left, next_index);
+    }
+  /*
+   * only 1 tx queue of given interface is available on given thread
+   */
+  else if (r->n_queues == 1)
+    {
+      n_left = enqueue_one_to_tx_node (vm, node, NULL, from, r->frame,
+				       n_vectors, n_left, next_index);
+    }
+  /*
+   * several tx queues: hash each packet to a queue id, enqueue per queue
+   */
+  else if (r->n_queues > 1)
+    {
+      u32 qids[VLIB_FRAME_SIZE];
+
+      hash_func_with_mask (p, qids, n_vectors, r->lookup_table,
+			   vec_len (r->lookup_table) - 1, hi->hf);
+
+      for (u32 i = 0; i < r->n_queues; i++)
+	{
+	  n_left = enqueue_one_to_tx_node (vm, node, qids, from, &r->frame[i],
+					   n_vectors, n_left, next_index);
+	  if (n_left == 0)
+	    break;
+	}
+    }
+  else
+    ASSERT (0);
+}
 
 VLIB_NODE_FN (vnet_interface_output_node)
@@ -418,6 +556,7 @@
   vnet_hw_interface_t *hi;
   vnet_sw_interface_t *si;
   vnet_interface_output_runtime_t *rt = (void *) node->runtime_data;
+  vnet_hw_if_output_node_runtime_t *r = 0;
   vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
   u32 n_bytes, n_buffers = frame->n_vectors;
   u32 config_index = ~0;
@@ -427,6 +566,8 @@
   u8 arc = im->output_feature_arc_index;
   int arc_or_subif = 0;
   int do_tx_offloads = 0;
+  void *ptr[VLIB_FRAME_SIZE], **p = ptr;
+  u8 is_parr = 0;
   u32 *from;
 
   if (node->flags & VLIB_NODE_FLAG_TRACE)
@@ -462,6 +603,27 @@
 	node->node_index, VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DOWN);
     }
 
+  if (hi->output_node_thread_runtimes)
+    r = vec_elt_at_index (hi->output_node_thread_runtimes, vm->thread_index);
+
+  if (r)
+    {
+      /*
+       * tx queue of given interface is not available on given thread
+       */
+      if (r->n_queues == 0)
+	return vlib_error_drop_buffers (
+	  vm, node, from,
+	  /* buffer stride */ 1, n_buffers, VNET_INTERFACE_OUTPUT_NEXT_DROP,
+	  node->node_index, VNET_INTERFACE_OUTPUT_ERROR_NO_TX_QUEUE);
+      /*
+       * multiple tx queues available on given thread
+       */
+      else if (r->n_queues > 1)
+	/* construct array of pointer */
+	is_parr = 1;
+    }
+
   /* interface-output feature arc handling */
   if (PREDICT_FALSE (vnet_have_features (arc, sw_if_index)))
     {
@@ -482,20 +644,28 @@
       VNET_HW_INTERFACE_CAP_SUPPORTS_TX_CKSUM)
     do_tx_offloads = 1;
 
-  if (do_tx_offloads == 0 && arc_or_subif == 0)
+  // basic processing
+  if (do_tx_offloads == 0 && arc_or_subif == 0 && is_parr == 0)
     n_bytes = vnet_interface_output_node_inline (
-      vm, sw_if_index, ccm, bufs, config_index, arc, n_buffers, 0);
-  else if (do_tx_offloads == 1 && arc_or_subif == 0)
+      vm, sw_if_index, ccm, bufs, NULL, config_index, arc, n_buffers, 0);
+  // basic processing + tx offloads
+  else if (do_tx_offloads == 1 && arc_or_subif == 0 && is_parr == 0)
     n_bytes = vnet_interface_output_node_inline (
-      vm, sw_if_index, ccm, bufs, config_index, arc, n_buffers, 1);
+      vm, sw_if_index, ccm, bufs, NULL, config_index, arc, n_buffers, 1);
+  // basic processing + tx offloads + vlans + arcs
+  else if (do_tx_offloads == 1 && arc_or_subif == 1 && is_parr == 0)
+    n_bytes = vnet_interface_output_node_inline (
+      vm, sw_if_index, ccm, bufs, NULL, config_index, arc, n_buffers, 2);
+  // basic processing + tx offloads + vlans + arcs + multi-txqs
   else
     n_bytes = vnet_interface_output_node_inline (
-      vm, sw_if_index, ccm, bufs, config_index, arc, n_buffers, 2);
+      vm, sw_if_index, ccm, bufs, p, config_index, arc, n_buffers, 3);
 
   from = vlib_frame_vector_args (frame);
   if (PREDICT_TRUE (next_index == VNET_INTERFACE_OUTPUT_NEXT_TX))
     {
-      enqueu_to_tx_node (vm, node, hi, from, frame->n_vectors);
+      enqueue_to_tx_node (vm, node, hi, next_index, r, from, ptr,
+			  frame->n_vectors);
     }
   else
     {
@@ -1087,16 +1257,14 @@
 {
   vnet_main_t *vnm = vnet_get_main ();
   vnet_interface_main_t *im = &vnm->interface_main;
-  vnet_hw_if_output_node_runtime_t *r = 0;
   vnet_hw_interface_t *hi;
-  vnet_hw_if_tx_frame_t *tf;
   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
   u32 sw_if_indices[VLIB_FRAME_SIZE], *sw_if_index = sw_if_indices;
   u64 used_elts[VLIB_FRAME_SIZE / 64] = {};
   u64 mask[VLIB_FRAME_SIZE / 64] = {};
-  u32 *tmp, *from, n_left, n_free, n_comp, *to, swif, off;
+  u32 *tmp, *from, n_left, n_comp, n_p_comp, swif, off;
   u16 next_index;
-  vlib_frame_t *f;
+  void *ptr[VLIB_FRAME_SIZE], **p = ptr;
 
   from = vlib_frame_vector_args (frame);
   n_left = frame->n_vectors;
@@ -1108,11 +1276,17 @@
       vlib_prefetch_buffer_header (b[5], LOAD);
       vlib_prefetch_buffer_header (b[6], LOAD);
       vlib_prefetch_buffer_header (b[7], LOAD);
+
+      p[0] = vlib_buffer_get_current (b[0]);
+      p[1] = vlib_buffer_get_current (b[1]);
+      p[2] = vlib_buffer_get_current (b[2]);
+      p[3] = vlib_buffer_get_current (b[3]);
       sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
       sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_TX];
       sw_if_index[2] = vnet_buffer (b[2])->sw_if_index[VLIB_TX];
       sw_if_index[3] = vnet_buffer (b[3])->sw_if_index[VLIB_TX];
 
+      p += 4;
       b += 4;
       sw_if_index += 4;
       n_left -= 4;
@@ -1120,7 +1294,9 @@
 
   while (n_left)
     {
+      p[0] = vlib_buffer_get_current (b[0]);
       sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
+      p++;
       b++;
       sw_if_index++;
       n_left--;
@@ -1137,68 +1313,40 @@
 more:
   next_index = vec_elt (im->if_out_arc_end_next_index_by_sw_if_index, swif);
   hi = vnet_get_sup_hw_interface (vnm, swif);
+  vnet_hw_if_output_node_runtime_t *r = 0;
+  void *ptr_tmp[VLIB_FRAME_SIZE], **p_tmp = ptr_tmp;
+
   if (hi->output_node_thread_runtimes)
     r = vec_elt_at_index (hi->output_node_thread_runtimes, vm->thread_index);
-  f = vlib_get_next_frame_internal (vm, node, next_index, 0);
-  tf = vlib_frame_scalar_args (f);
-
-  if (f->n_vectors > 0 && (r == 0 || r->frame.queue_id == tf->queue_id))
-    {
-      /* append frame */
-      n_free = VLIB_FRAME_SIZE - f->n_vectors;
-      if (n_free >= f->n_vectors)
-	to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
-      else
-	to = tmp;
-    }
-  else
-    {
-      if (f->n_vectors > 0)
-	{
-	  /* current frame doesn't fit - grab empty one */
-	  f = vlib_get_next_frame_internal (vm, node, next_index, 1);
-	  tf = vlib_frame_scalar_args (f);
-	}
-
-      /* empty frame - store scalar data */
-      store_tx_frame_scalar_data (r, tf);
-      n_free = VLIB_FRAME_SIZE;
-      to = vlib_frame_vector_args (f);
-    }
 
   /* compare and compress based on comparison mask */
   clib_mask_compare_u32 (swif, sw_if_indices, mask, frame->n_vectors);
-  n_comp = clib_compress_u32 (to, from, mask, frame->n_vectors);
+  n_comp = clib_compress_u32 (tmp, from, mask, frame->n_vectors);
 
-  if (tmp != to)
+  /*
+   * tx queue of given interface is not available on given thread
+   */
+  if (r)
     {
-      /* indices already written to frame, just close it */
-      vlib_put_next_frame (vm, node, next_index, n_free - n_comp);
-    }
-  else if (n_free >= n_comp)
-    {
-      /* enough space in the existing frame */
-      to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
-      vlib_buffer_copy_indices (to, tmp, n_comp);
-      vlib_put_next_frame (vm, node, next_index, n_free - n_comp);
-    }
-  else
-    {
-      /* full frame */
-      to = (u32 *) vlib_frame_vector_args (f) + f->n_vectors;
-      vlib_buffer_copy_indices (to, tmp, n_free);
-      vlib_put_next_frame (vm, node, next_index, 0);
-
-      /* second frame */
-      u32 n_frame2 = n_comp - n_free;
-      f = vlib_get_next_frame_internal (vm, node, next_index, 1);
-      to = vlib_frame_vector_args (f);
-      vlib_buffer_copy_indices (to, tmp + n_free, n_frame2);
-      tf = vlib_frame_scalar_args (f);
-      store_tx_frame_scalar_data (r, tf);
-      vlib_put_next_frame (vm, node, next_index, VLIB_FRAME_SIZE - n_frame2);
+      if (r->n_queues == 0)
+	{
+	  vlib_error_drop_buffers (
+	    vm, node, tmp,
+	    /* buffer stride */ 1, n_comp, VNET_INTERFACE_OUTPUT_NEXT_DROP,
+	    node->node_index, VNET_INTERFACE_OUTPUT_ERROR_NO_TX_QUEUE);
+	  goto drop;
+	}
+      else if (r->n_queues > 1)
+	{
+	  n_p_comp = clib_compress_u64 ((u64 *) p_tmp, (u64 *) ptr, mask,
+					frame->n_vectors);
+	  ASSERT (n_p_comp == n_comp);
+	}
     }
 
+  enqueue_to_tx_node (vm, node, hi, next_index, r, tmp, ptr_tmp, n_comp);
+
+drop:
   n_left -= n_comp;
   if (n_left)
     {
diff --git a/src/vnet/interface_test.c b/src/vnet/interface_test.c
index 4a1681f..c3ddcd7 100644
--- a/src/vnet/interface_test.c
+++ b/src/vnet/interface_test.c
@@ -570,6 +570,77 @@
 }
 
+/*
+ * VAT command handler: set the tx placement of an interface queue.
+ *
+ * Recognized input: "queue <id>", "threads <bitmap list>",
+ * "mask <bitmap mask>", an interface name or "sw_if_index <id>".
+ * Sends SW_INTERFACE_SET_TX_PLACEMENT carrying one threads[] entry per
+ * bit set in the parsed bitmap and waits for the reply.
+ *
+ * Returns -99 if no interface was given, else the value W () stores in ret.
+ */
 static int
+api_sw_interface_set_tx_placement (vat_main_t *vam)
+{
+  unformat_input_t *i = vam->input;
+  vl_api_sw_interface_set_tx_placement_t *mp;
+  u32 sw_if_index;
+  u8 sw_if_index_set = 0;
+  int ret;
+  uword *bitmap = 0;
+  /* default to queue 0: "queue" is optional and must not send garbage */
+  u32 queue_id = 0, n_bits = 0;
+  u32 v;
+
+  /* Parse args required to build the message */
+  while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (i, "queue %d", &queue_id))
+	;
+      else if (unformat (i, "threads %U", unformat_bitmap_list, &bitmap))
+	;
+      else if (unformat (i, "mask %U", unformat_bitmap_mask, &bitmap))
+	;
+      else if (unformat (i, "%U", api_unformat_sw_if_index, vam, &sw_if_index))
+	sw_if_index_set = 1;
+      else if (unformat (i, "sw_if_index %d", &sw_if_index))
+	sw_if_index_set = 1;
+      else
+	break;
+    }
+
+  if (sw_if_index_set == 0)
+    {
+      errmsg ("missing interface name or sw_if_index");
+      /* "threads"/"mask" may already have allocated the bitmap */
+      clib_bitmap_free (bitmap);
+      return -99;
+    }
+
+  n_bits = clib_bitmap_count_set_bits (bitmap);
+  /* Construct the API message */
+  M2 (SW_INTERFACE_SET_TX_PLACEMENT, mp, sizeof (u32) * n_bits);
+  mp->sw_if_index = htonl (sw_if_index);
+  mp->queue_id = htonl (queue_id);
+  mp->array_size = htonl (n_bits);
+
+  /* walk the set bits: one threads[] entry per bit */
+  v = clib_bitmap_first_set (bitmap);
+  for (u32 j = 0; j < n_bits; j++)
+    {
+      mp->threads[j] = htonl (v);
+      v = clib_bitmap_next_set (bitmap, v + 1);
+    }
+
+  /* send it... */
+  S (mp);
+  /* Wait for a reply, return the good/bad news... */
+  W (ret);
+  clib_bitmap_free (bitmap);
+  return ret;
+}
+
+static int
 api_interface_name_renumber (vat_main_t *vam)
 {
   unformat_input_t *line_input = vam->input;
@@ -844,6 +901,33 @@
 			   ((mp->mode == 2) ? "interrupt" : "adaptive"));
 }
 
+/*
+ * Handler for a sw_interface_tx_placement_details message: print one
+ * record (sw_if_index, queue id, shared flag, thread list) on vam->ofp.
+ */
+static __clib_unused void
+vl_api_sw_interface_tx_placement_details_t_handler (
+  vl_api_sw_interface_tx_placement_details_t *mp)
+{
+  vat_main_t *vam = interface_test_main.vat_main;
+  u32 size = ntohl (mp->array_size);
+  uword *bitmap = 0;
+
+  /* collect threads[] into a bitmap so format_bitmap_list can print it */
+  for (u32 i = 0; i < size; i++)
+    {
+      u32 thread_index = ntohl (mp->threads[i]);
+      bitmap = clib_bitmap_set (bitmap, thread_index, 1);
+    }
+
+  print (vam->ofp, "\n%-11d %-6d %-7s %U", ntohl (mp->sw_if_index),
+	 ntohl (mp->queue_id), (mp->shared == 1) ? "yes" : "no",
+	 format_bitmap_list, bitmap);
+
+  /* the bitmap is only a formatting aid - release it before returning */
+  clib_bitmap_free (bitmap);
+}
+
 static void
 vl_api_create_vlan_subif_reply_t_handler (vl_api_create_vlan_subif_reply_t *mp)
 {
@@ -961,6 +1037,67 @@
 }
 
+/*
+ * VAT command handler: request tx placement records.
+ *
+ * Recognized input: an interface name or "sw_if_index <id>"; when neither
+ * is given the request carries sw_if_index ~0. Prints the table header,
+ * sends SW_INTERFACE_TX_PLACEMENT_GET followed by a control ping, and
+ * waits for the reply.
+ *
+ * Returns the value W () stores in ret.
+ */
 static int
+api_sw_interface_tx_placement_get (vat_main_t *vam)
+{
+  unformat_input_t *i = vam->input;
+  vl_api_sw_interface_tx_placement_get_t *mp;
+  vl_api_control_ping_t *mp_ping;
+  int ret;
+  u32 sw_if_index;
+  u8 sw_if_index_set = 0;
+
+  while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
+    {
+      /* plain flag: a u8 counter would wrap to 0 after 256 matches */
+      if (unformat (i, "%U", api_unformat_sw_if_index, vam, &sw_if_index))
+	sw_if_index_set = 1;
+      else if (unformat (i, "sw_if_index %d", &sw_if_index))
+	sw_if_index_set = 1;
+      else
+	break;
+    }
+
+  fformat (vam->ofp, "\n%-11s %-6s %-7s %-11s", "sw_if_index", "queue",
+	   "shared", "threads");
+
+  /* Dump Interface tx placement */
+  M (SW_INTERFACE_TX_PLACEMENT_GET, mp);
+
+  if (sw_if_index_set)
+    mp->sw_if_index = htonl (sw_if_index);
+  else
+    mp->sw_if_index = ~0;
+
+  S (mp);
+
+  /* Use a control ping for synchronization */
+  PING (&interface_test_main, mp_ping);
+  S (mp_ping);
+
+  W (ret);
+  return ret;
+}
+
+/*
+ * Handler for the sw_interface_tx_placement_get reply; the body is empty.
+ */
+static void
+vl_api_sw_interface_tx_placement_get_reply_t_handler (
+  vl_api_sw_interface_tx_placement_get_reply_t *mp)
+{
+}
+
+static int
 api_sw_interface_clear_stats (vat_main_t *vam)
 {
   unformat_input_t *i = vam->input;
diff --git a/src/vnet/pg/stream.c b/src/vnet/pg/stream.c
index 605567c..2c75c2b 100644
--- a/src/vnet/pg/stream.c
+++ b/src/vnet/pg/stream.c
@@ -245,6 +245,7 @@
   .build_rewrite = NULL,
   //.update_adjacency = gre_update_adj,
   .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P,
+  .tx_hash_fn_type = VNET_HASH_FN_TYPE_IP,
 };
 
 u32