Stat segment / client: "show run" works now

Seems to have minimal-to-zero performance consequences. Data appears
accurate: results match the debug CLI output. Checked at low rates,
and at 27 MPPS sprayed across two worker threads.
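
Sample stat client output, for reference (node names and figures
illustrative only; format per the fformat calls in stat_client.c):

  Thread 1 -------------------------
  ip4-lookup (active): clocks 14380283 calls 55324 vectors 14162944 clocks/pkt 1.02
  average vectors/node 256.00
   vectors rates in 1.3500e+07, out 1.3500e+07, drop 0.0000e+00, punt 0.0000e+00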

Change-Id: I09ede5150b88a91547feeee448a2854997613004
Signed-off-by: Dave Barach <dave@barachs.net>
diff --git a/src/vat/api_format.c b/src/vat/api_format.c
index 635b4ef..b9f0804 100644
--- a/src/vat/api_format.c
+++ b/src/vat/api_format.c
@@ -3088,13 +3088,14 @@
     {
       hash_free (vam->graph_node_index_by_name);
 
-      for (i = 0; i < vec_len (vam->graph_nodes); i++)
+      for (i = 0; i < vec_len (vam->graph_nodes[0]); i++)
 	{
-	  node = vam->graph_nodes[i];
+	  node = vam->graph_nodes[0][i];
 	  vec_free (node->name);
 	  vec_free (node->next_nodes);
 	  vec_free (node);
 	}
+      vec_free (vam->graph_nodes[0]);
       vec_free (vam->graph_nodes);
     }
 
@@ -3102,9 +3103,9 @@
   vam->graph_nodes = vlib_node_unserialize (pvt_copy);
   vec_free (pvt_copy);
 
-  for (i = 0; i < vec_len (vam->graph_nodes); i++)
+  for (i = 0; i < vec_len (vam->graph_nodes[0]); i++)
     {
-      node = vam->graph_nodes[i];
+      node = vam->graph_nodes[0][i];
       hash_set_mem (vam->graph_node_index_by_name, node->name, i);
     }
 }
@@ -23389,15 +23390,15 @@
       return 0;
     }
 
-  for (i = 0; i < vec_len (vam->graph_nodes); i++)
+  for (i = 0; i < vec_len (vam->graph_nodes[0]); i++)
     {
-      node = vam->graph_nodes[i];
+      node = vam->graph_nodes[0][i];
       print (vam->ofp, "[%d] %s", i, node->name);
       for (j = 0; j < vec_len (node->next_nodes); j++)
 	{
 	  if (node->next_nodes[j] != ~0)
 	    {
-	      next_node = vam->graph_nodes[node->next_nodes[j]];
+	      next_node = vam->graph_nodes[0][node->next_nodes[j]];
 	      print (vam->ofp, "  [%d] %s", j, next_node->name);
 	    }
 	}
@@ -23492,13 +23493,13 @@
 	      print (vam->ofp, "%s not found...", node_to_find);
 	      goto out;
 	    }
-	  node = vam->graph_nodes[p[0]];
+	  node = vam->graph_nodes[0][p[0]];
 	  print (vam->ofp, "[%d] %s", p[0], node->name);
 	  for (j = 0; j < vec_len (node->next_nodes); j++)
 	    {
 	      if (node->next_nodes[j] != ~0)
 		{
-		  next_node = vam->graph_nodes[node->next_nodes[j]];
+		  next_node = vam->graph_nodes[0][node->next_nodes[j]];
 		  print (vam->ofp, "  [%d] %s", j, next_node->name);
 		}
 	    }
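
vlib_node_unserialize now returns a per-thread vector of node vectors,
hence the new [0] indexing above: VAT only consults thread 0. Shape
sketch (indices illustrative):

  vlib_node_t ***graph_nodes;    /* graph_nodes[thread][node_index] */
  node = graph_nodes[0][i];      /* thread 0, node i */
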
diff --git a/src/vat/vat.h b/src/vat/vat.h
index 19796b9..c9384a1 100644
--- a/src/vat/vat.h
+++ b/src/vat/vat.h
@@ -129,7 +129,7 @@
 
   /* Graph node table */
   uword *graph_node_index_by_name;
-  vlib_node_t **graph_nodes;
+  vlib_node_t ***graph_nodes;
 
   /* ip tables */
   ip_details_t *ip_details_by_sw_if_index[2];
diff --git a/src/vlib/cli.c b/src/vlib/cli.c
index f684289..ca8d2ab 100644
--- a/src/vlib/cli.c
+++ b/src/vlib/cli.c
@@ -811,7 +811,7 @@
 
   while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
     {
-      if (!unformat (line_input, "%U", unformat_vlib_enable_disable, &enable))
+      if (unformat (line_input, "%U", unformat_vlib_enable_disable, &enable))
 	;
       else if (unformat (line_input, "api-segment"))
 	api_segment = 1;
diff --git a/src/vlib/node.c b/src/vlib/node.c
index cc1732b..805c69e 100644
--- a/src/vlib/node.c
+++ b/src/vlib/node.c
@@ -563,20 +563,20 @@
     }
 }
 
-vlib_node_t ***
-vlib_node_get_nodes (vlib_main_t * vm, u32 max_threads, int include_stats)
+void
+vlib_node_get_nodes (vlib_main_t * vm, u32 max_threads, int include_stats,
+		     int barrier_sync, vlib_node_t **** node_dupsp,
+		     vlib_main_t *** stat_vmsp)
 {
   vlib_node_main_t *nm = &vm->node_main;
   vlib_node_t *n;
-  static vlib_node_t ***node_dups;
+  vlib_node_t ***node_dups = *node_dupsp;
   vlib_node_t **nodes;
-  static vlib_main_t **stat_vms;
+  vlib_main_t **stat_vms = *stat_vmsp;
   vlib_main_t *stat_vm;
   uword i, j;
   u32 threads_to_serialize;
 
-  vec_reset_length (node_dups);
-
   if (vec_len (stat_vms) == 0)
     {
       for (i = 0; i < vec_len (vlib_mains); i++)
@@ -589,11 +589,14 @@
 
   threads_to_serialize = clib_min (max_threads, vec_len (stat_vms));
 
+  vec_validate (node_dups, threads_to_serialize - 1);
+
   /*
    * Barrier sync across stats scraping.
    * Otherwise, the counts will be grossly inaccurate.
    */
-  vlib_worker_thread_barrier_sync (vm);
+  if (barrier_sync)
+    vlib_worker_thread_barrier_sync (vm);
 
   for (j = 0; j < threads_to_serialize; j++)
     {
@@ -609,12 +612,17 @@
 	    }
 	}
 
-      nodes = vec_dup (nm->nodes);
-      vec_add1 (node_dups, nodes);
+      nodes = node_dups[j];
+      vec_validate (nodes, vec_len (nm->nodes) - 1);
+      clib_memcpy (nodes, nm->nodes, vec_len (nm->nodes) * sizeof (nodes[0]));
+      node_dups[j] = nodes;
     }
-  vlib_worker_thread_barrier_release (vm);
 
-  return node_dups;
+  if (barrier_sync)
+    vlib_worker_thread_barrier_release (vm);
+
+  *node_dupsp = node_dups;
+  *stat_vmsp = stat_vms;
 }
 
 clib_error_t *
diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h
index 547f09b..bb302f7 100644
--- a/src/vlib/node_funcs.h
+++ b/src/vlib/node_funcs.h
@@ -1127,8 +1127,10 @@
 /**
  * Get list of nodes
  */
-vlib_node_t ***vlib_node_get_nodes (vlib_main_t * vm, u32 max_threads,
-				    int include_stats);
+void
+vlib_node_get_nodes (vlib_main_t * vm, u32 max_threads, int include_stats,
+		     int barrier_sync, vlib_node_t **** node_dupsp,
+		     vlib_main_t *** stat_vmsp);
 
 /* Query node given name. */
 vlib_node_t *vlib_get_node_by_name (vlib_main_t * vm, u8 * name);
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index bbe94c7..487c501 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -1492,6 +1492,18 @@
 
 }
 
+void vlib_stat_segment_lock (void) __attribute__ ((weak));
+void
+vlib_stat_segment_lock (void)
+{
+}
+
+void vlib_stat_segment_unlock (void) __attribute__ ((weak));
+void
+vlib_stat_segment_unlock (void)
+{
+}
+
 void
 vlib_worker_thread_barrier_release (vlib_main_t * vm)
 {
@@ -1521,6 +1533,13 @@
   /* Update (all) node runtimes before releasing the barrier, if needed */
   if (vm->need_vlib_worker_thread_node_runtime_update)
     {
+      /*
+       * Lock stat segment here, so we're safe when
+       * rebuilding the stat segment node clones from the
+       * stat thread...
+       */
+      vlib_stat_segment_lock ();
+
       /* Do stats elements on main thread */
       worker_thread_node_runtime_update_internal ();
       vm->need_vlib_worker_thread_node_runtime_update = 0;
@@ -1562,6 +1581,7 @@
 	      os_panic ();
 	    }
 	}
+      vlib_stat_segment_unlock ();
     }
 
   t_closed_total = now - vm->barrier_epoch;
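
The weak no-op stubs above let images that don't link the stats
segment still resolve the lock calls; vpp proper overrides them with
the strong definitions added in src/vpp/stats/stat_segment.c. A
minimal sketch of the pattern, with hypothetical names:

  /* library: weak default, a no-op */
  void hook (void) __attribute__ ((weak));
  void hook (void) { }

  /* application: strong definition wins at link time */
  void hook (void) { do_real_work (); }
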
diff --git a/src/vlibapi/api.h b/src/vlibapi/api.h
index 48c3813..7238a31 100644
--- a/src/vlibapi/api.h
+++ b/src/vlibapi/api.h
@@ -117,7 +117,7 @@
 /* node_serialize.c prototypes */
 u8 *vlib_node_serialize (vlib_main_t * vm, vlib_node_t *** node_dups,
 			 u8 * vector, int include_nexts, int include_stats);
-vlib_node_t **vlib_node_unserialize (u8 * vector);
+vlib_node_t ***vlib_node_unserialize (u8 * vector);
 
 u32 vl_msg_api_get_msg_length (void *msg_arg);
 
diff --git a/src/vlibapi/node_serialize.c b/src/vlibapi/node_serialize.c
index 575de11..b50d79e 100644
--- a/src/vlibapi/node_serialize.c
+++ b/src/vlibapi/node_serialize.c
@@ -124,6 +124,7 @@
 
 	  serialize_likely_small_unsigned_integer (sm, (u64) state_code);
 	  serialize_likely_small_unsigned_integer (sm, n->type);
+	  serialize_likely_small_unsigned_integer (sm, n->flags);
 
 	  if (include_nexts)
 	    {
@@ -152,7 +153,6 @@
 	  else			/* no stats */
 	    serialize_likely_small_unsigned_integer (sm, 0);
 	}
-      vec_free (nodes);
     }
   return (serialize_close_vector (sm));
 }
@@ -197,6 +197,7 @@
 	  node->state_string = (u8 *) state_strings[state_code];
 
 	  node->type = unserialize_likely_small_unsigned_integer (sm);
+	  node->flags = unserialize_likely_small_unsigned_integer (sm);
 	  nnexts = unserialize_likely_small_unsigned_integer (sm);
 	  if (nnexts > 0)
 	    vec_validate (node->next_nodes, nnexts - 1);
diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c
index 8bb11c9..8e24493 100644
--- a/src/vpp/api/api.c
+++ b/src/vpp/api/api.c
@@ -367,7 +367,8 @@
   vlib_main_t *vm = vlib_get_main ();
   void *oldheap;
   vl_api_get_node_graph_reply_t *rmp;
-  vlib_node_t ***node_dups;
+  static vlib_node_t ***node_dups;
+  static vlib_main_t **stat_vms;
 
   pthread_mutex_lock (&am->vlib_rp->mutex);
   oldheap = svm_push_data_heap (am->vlib_rp);
@@ -378,9 +379,10 @@
   vec_validate (vector, 16384);
   vec_reset_length (vector);
 
-  /* $$$$ FIXME */
-  node_dups = vlib_node_get_nodes (vm, (u32) ~ 0 /* all threads */ ,
-				   1 /* include stats */ );
+  vlib_node_get_nodes (vm, 0 /* main threads */ ,
+		       0 /* include stats */ ,
+		       1 /* barrier sync */ ,
+		       &node_dups, &stat_vms);
   vector = vlib_node_serialize (vm, node_dups, vector, 1 /* include nexts */ ,
 				1 /* include stats */ );
 
diff --git a/src/vpp/app/stat_client.c b/src/vpp/app/stat_client.c
index 610a6a5..96c1bda 100644
--- a/src/vpp/app/stat_client.c
+++ b/src/vpp/app/stat_client.c
@@ -144,12 +144,17 @@
 #define foreach_cached_pointer                                          \
 _(vector_rate, SCALAR_POINTER, &stat_client_main.vector_rate_ptr)       \
 _(input_rate, SCALAR_POINTER, &stat_client_main.input_rate_ptr)         \
+_(last_update, SCALAR_POINTER, &stat_client_main.last_runtime_ptr)      \
+_(last_stats_clear, SCALAR_POINTER,                                     \
+  &stat_client_main.last_runtime_stats_clear_ptr)                       \
 _(rx, COUNTER_VECTOR, &stat_client_main.intfc_rx_counters)              \
 _(tx, COUNTER_VECTOR, &stat_client_main.intfc_tx_counters)              \
 _(/err/0/counter_vector, VECTOR_POINTER,                                \
   &stat_client_main.thread_0_error_counts)                              \
 _(/err/IP4 source address matches local interface, ERROR_INDEX,         \
-  &stat_client_main.source_address_match_error_index)
+  &stat_client_main.source_address_match_error_index)                   \
+_(serialized_nodes, SERIALIZED_NODES,                                   \
+  &stat_client_main.serialized_nodes)
 
 typedef struct
 {
@@ -213,9 +218,12 @@
   ssvm_private_t *ssvmp = &sm->stat_segment;
   ssvm_shared_header_t *shared_header;
   vlib_counter_t *thread0_rx_counters = 0, *thread0_tx_counters = 0;
+  vlib_node_t ***nodes_by_thread;
+  vlib_node_t **nodes;
+  vlib_node_t *n;
   f64 vector_rate, input_rate;
   u32 len;
-  int i;
+  int i, j;
   u32 source_address_match_errors;
 
   /* Wait until the stats segment is mapped */
@@ -290,6 +298,106 @@
 
       fformat (stdout, "%lld source address match errors\n",
 	       source_address_match_errors);
+
+      if (sm->serialized_nodes)
+	{
+	  nodes_by_thread = vlib_node_unserialize (sm->serialized_nodes);
+
+	  /* Across all threads... */
+	  for (i = 0; i < vec_len (nodes_by_thread); i++)
+	    {
+	      u64 n_input, n_output, n_drop, n_punt;
+	      u64 n_internal_vectors, n_internal_calls;
+	      u64 n_clocks, l, v, c;
+	      f64 dt;
+
+	      nodes = nodes_by_thread[i];
+
+	      fformat (stdout, "Thread %d -------------------------\n", i);
+
+	      n_input = n_output = n_drop = n_punt = n_clocks = 0;
+	      n_internal_vectors = n_internal_calls = 0;
+
+	      /* Across all nodes */
+	      for (j = 0; j < vec_len (nodes); j++)
+		{
+		  n = nodes[j];
+
+		  /* Exactly stolen from node_cli.c... */
+		  l = n->stats_total.clocks - n->stats_last_clear.clocks;
+		  n_clocks += l;
+
+		  v = n->stats_total.vectors - n->stats_last_clear.vectors;
+		  c = n->stats_total.calls - n->stats_last_clear.calls;
+
+		  switch (n->type)
+		    {
+		    default:
+		      continue;
+
+		    case VLIB_NODE_TYPE_INTERNAL:
+		      n_output +=
+			(n->flags & VLIB_NODE_FLAG_IS_OUTPUT) ? v : 0;
+		      n_drop += (n->flags & VLIB_NODE_FLAG_IS_DROP) ? v : 0;
+		      n_punt += (n->flags & VLIB_NODE_FLAG_IS_PUNT) ? v : 0;
+		      if (!(n->flags & VLIB_NODE_FLAG_IS_OUTPUT))
+			{
+			  n_internal_vectors += v;
+			  n_internal_calls += c;
+			}
+		      if (n->flags & VLIB_NODE_FLAG_IS_HANDOFF)
+			n_input += v;
+		      break;
+
+		    case VLIB_NODE_TYPE_INPUT:
+		      n_input += v;
+		      break;
+		    }
+
+		  if (n->stats_total.calls)
+		    {
+		      fformat (stdout,
+			       "%s (%s): clocks %lld calls %lld vectors %lld ",
+			       n->name,
+			       n->state_string,
+			       n->stats_total.clocks,
+			       n->stats_total.calls, n->stats_total.vectors);
+		      if (n->stats_total.vectors)
+			fformat (stdout, "clocks/pkt %.2f\n",
+				 (f64) n->stats_total.clocks /
+				 (f64) n->stats_total.vectors);
+		      else
+			fformat (stdout, "\n");
+		    }
+		  vec_free (n->name);
+		  vec_free (n->next_nodes);
+		  vec_free (n);
+		}
+
+	      fformat (stdout, "average vectors/node %.2f\n",
+		       (n_internal_calls > 0
+			? (f64) n_internal_vectors / (f64) n_internal_calls
+			: 0));
+
+
+	      dt = *sm->last_runtime_ptr - *sm->last_runtime_stats_clear_ptr;
+
+	      fformat (stdout,
+		       " vectors rates in %.4e, out %.4e, drop %.4e, "
+		       "punt %.4e\n",
+		       (f64) n_input / dt,
+		       (f64) n_output / dt, (f64) n_drop / dt,
+		       (f64) n_punt / dt);
+
+	      vec_free (nodes);
+	    }
+	  vec_free (nodes_by_thread);
+	}
+      else
+	{
+	  fformat (stdout, "serialized nodes NULL?\n");
+	}
+
     }
 }
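
For scale on the vector-rate lines above: dt is the wall-clock window
since the last runtime-stats clear, read from the new last_update and
last_stats_clear scalars, so e.g. 2.7e9 input vectors accumulated over
a 100 second window prints as "in 2.7000e+07".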
 
diff --git a/src/vpp/app/stat_client.h b/src/vpp/app/stat_client.h
index 9cec1ee..87e5409 100644
--- a/src/vpp/app/stat_client.h
+++ b/src/vpp/app/stat_client.h
@@ -31,6 +31,8 @@
   /* Cached pointers to scalar quantities, these wont change */
   f64 *vector_rate_ptr;
   f64 *input_rate_ptr;
+  f64 *last_runtime_ptr;
+  f64 *last_runtime_stats_clear_ptr;
 
   volatile int segment_ready;
 
@@ -40,6 +42,7 @@
    */
   vlib_counter_t **intfc_rx_counters;
   vlib_counter_t **intfc_tx_counters;
+  u8 *serialized_nodes;
 
   u64 *thread_0_error_counts;
   u64 source_address_match_error_index;
diff --git a/src/vpp/stats/stat_segment.c b/src/vpp/stats/stat_segment.c
index f1db684..6fb9c83 100644
--- a/src/vpp/stats/stat_segment.c
+++ b/src/vpp/stats/stat_segment.c
@@ -14,6 +14,20 @@
  */
 #include <vpp/stats/stats.h>
 
+void
+vlib_stat_segment_lock (void)
+{
+  stats_main_t *sm = &stats_main;
+  clib_spinlock_lock (sm->stat_segment_lockp);
+}
+
+void
+vlib_stat_segment_unlock (void)
+{
+  stats_main_t *sm = &stats_main;
+  clib_spinlock_unlock (sm->stat_segment_lockp);
+}
+
 void *
 vlib_stats_push_heap (void)
 {
@@ -215,6 +229,8 @@
 					CLIB_CACHE_LINE_BYTES);
   sm->vector_rate_ptr = (scalar_data + 0);
   sm->input_rate_ptr = (scalar_data + 1);
+  sm->last_runtime_ptr = (scalar_data + 2);
+  sm->last_runtime_stats_clear_ptr = (scalar_data + 3);
 
   name = format (0, "vector_rate%c", 0);
   ep = clib_mem_alloc (sizeof (*ep));
@@ -230,6 +246,21 @@
 
   hash_set_mem (sm->counter_vector_by_name, name, ep);
 
+  name = format (0, "last_update%c", 0);
+  ep = clib_mem_alloc (sizeof (*ep));
+  ep->type = STAT_DIR_TYPE_SCALAR_POINTER;
+  ep->value = sm->last_runtime_ptr;
+
+  hash_set_mem (sm->counter_vector_by_name, name, ep);
+
+  name = format (0, "last_stats_clear%c", 0);
+  ep = clib_mem_alloc (sizeof (*ep));
+  ep->type = STAT_DIR_TYPE_SCALAR_POINTER;
+  ep->value = sm->last_runtime_stats_clear_ptr;
+
+  hash_set_mem (sm->counter_vector_by_name, name, ep);
+
+
   /* Publish the hash table */
   shared_header->opaque[STAT_SEGMENT_OPAQUE_DIR] = sm->counter_vector_by_name;
 
@@ -279,6 +310,10 @@
       type_name = "CMainPtr";
       break;
 
+    case STAT_DIR_TYPE_SERIALIZED_NODES:
+      type_name = "SerNodesPtr";
+      break;
+
     case STAT_DIR_TYPE_ERROR_INDEX:
       type_name = "ErrIndex";
       format_string = "%-10s %20lld";
@@ -292,8 +327,6 @@
   return format (s, format_string, type_name, ep->value);
 }
 
-
-
 static clib_error_t *
 show_stat_segment_command_fn (vlib_main_t * vm,
 			      unformat_input_t * input,
@@ -362,62 +395,120 @@
 };
 /* *INDENT-ON* */
 
-static uword
-stat_segment_process (vlib_main_t * vm, vlib_node_runtime_t * rt,
-		      vlib_frame_t * f)
+static inline void
+update_serialized_nodes (stats_main_t * sm)
 {
-  f64 vector_rate;
-  u64 input_packets, last_input_packets;
-  f64 last_runtime, dt, now;
-  vlib_main_t *this_vlib_main;
-  stats_main_t *sm = &stats_main;
   int i;
+  vlib_main_t *vm = vlib_mains[0];
+  ssvm_private_t *ssvmp = &sm->stat_segment;
+  ssvm_shared_header_t *shared_header;
+  void *oldheap;
+  stat_segment_directory_entry_t *ep;
+  hash_pair_t *hp;
+  u8 *name_copy;
 
-  last_runtime = 0.0;
-  last_input_packets = 0;
+  ASSERT (ssvmp && ssvmp->sh);
 
-  last_runtime = 0.0;
-  last_input_packets = 0;
+  vec_reset_length (sm->serialized_nodes);
 
-  while (1)
+  shared_header = ssvmp->sh;
+
+  oldheap = ssvm_push_heap (shared_header);
+
+  clib_spinlock_lock (sm->stat_segment_lockp);
+
+  vlib_node_get_nodes (0 /* vm, for barrier sync */ ,
+		       (u32) ~ 0 /* all threads */ ,
+		       1 /* include stats */ ,
+		       0 /* barrier sync */ ,
+		       &sm->node_dups, &sm->stat_vms);
+
+  sm->serialized_nodes = vlib_node_serialize (vm, sm->node_dups,
+					      sm->serialized_nodes,
+					      0 /* include nexts */ ,
+					      1 /* include stats */ );
+
+  hp = hash_get_pair (sm->counter_vector_by_name, "serialized_nodes");
+  if (hp)
     {
-      vlib_process_suspend (vm, 5.0);
+      name_copy = (u8 *) hp->key;
+      ep = (stat_segment_directory_entry_t *) (hp->value[0]);
 
-      /*
-       * Compute the average vector rate across all workers
-       */
-      vector_rate = 0.0;
+      if (ep->value != sm->serialized_nodes)
+	{
+	  ep->value = sm->serialized_nodes;
+	  /* Warn clients to refresh any pointers they might be holding */
+	  shared_header->opaque[STAT_SEGMENT_OPAQUE_EPOCH] = (void *)
+	    ((u64) shared_header->opaque[STAT_SEGMENT_OPAQUE_EPOCH] + 1);
+	}
+    }
+  else
+    {
+      name_copy = format (0, "%s%c", "serialized_nodes", 0);
+      ep = clib_mem_alloc (sizeof (*ep));
+      ep->type = STAT_DIR_TYPE_SERIALIZED_NODES;
+      ep->value = sm->serialized_nodes;
+      hash_set_mem (sm->counter_vector_by_name, name_copy, ep);
 
-      /* *INDENT-OFF* */
-      for (i = 0; i < vec_len (vlib_mains); i++)
-        {
-          this_vlib_main = vlib_mains[i];
-          vector_rate += vlib_last_vector_length_per_node (vm);
-        }
-      vector_rate /= (f64) i;
+      /* Reset the client hash table pointer */
+      shared_header->opaque[STAT_SEGMENT_OPAQUE_DIR]
+	= sm->counter_vector_by_name;
 
-      /* *INDENT-ON* */
-
-      *sm->vector_rate_ptr = vector_rate / ((f64) vec_len (vlib_mains));
-      now = vlib_time_now (vm);
-      dt = now - last_runtime;
-      input_packets = vnet_get_aggregate_rx_packets ();
-      *sm->input_rate_ptr = (f64) (input_packets - last_input_packets) / dt;
-      last_runtime = now;
-      last_input_packets = input_packets;
+      /* Warn clients to refresh any pointers they might be holding */
+      shared_header->opaque[STAT_SEGMENT_OPAQUE_EPOCH] = (void *)
+	((u64) shared_header->opaque[STAT_SEGMENT_OPAQUE_EPOCH] + 1);
     }
 
-  return 0;			/* not so much */
+  clib_spinlock_unlock (sm->stat_segment_lockp);
+  ssvm_pop_heap (oldheap);
 }
 
-/* *INDENT-OFF* */
-VLIB_REGISTER_NODE (stat_segment_node,static) =
+/*
+ * Called by stats_thread_fn, in stats.c, which runs in a
+ * separate pthread, so it won't halt the parade
+ * in single-forwarding-core cases.
+ */
+
+void
+do_stat_segment_updates (stats_main_t * sm)
 {
-  .function = stat_segment_process,
-  .type = VLIB_NODE_TYPE_PROCESS,
-  .name = "stat-segment-process",
-};
-/* *INDENT-ON* */
+  vlib_main_t *vm = vlib_mains[0];
+  f64 vector_rate;
+  u64 input_packets, last_input_packets;
+  f64 dt, now;
+  vlib_main_t *this_vlib_main;
+  int i, start;
+
+  /*
+   * Compute the average vector rate across all workers
+   */
+  vector_rate = 0.0;
+
+  start = vec_len (vlib_mains) > 1 ? 1 : 0;
+
+  for (i = start; i < vec_len (vlib_mains); i++)
+    {
+      this_vlib_main = vlib_mains[i];
+      vector_rate += vlib_last_vector_length_per_node (this_vlib_main);
+    }
+  vector_rate /= (f64) (i - start);
+
+  *sm->vector_rate_ptr = vector_rate / ((f64) (vec_len (vlib_mains) - start));
+
+  /*
+   * Compute the aggregate input rate
+   */
+  now = vlib_time_now (vm);
+  dt = now - sm->last_runtime_ptr[0];
+  input_packets = vnet_get_aggregate_rx_packets ();
+  *sm->input_rate_ptr = (f64) (input_packets - sm->last_input_packets) / dt;
+  sm->last_runtime_ptr[0] = now;
+  sm->last_input_packets = input_packets;
+  sm->last_runtime_stats_clear_ptr[0] =
+    vm->node_main.time_last_runtime_stats_clear;
+
+  update_serialized_nodes (sm);
+}
 
 
 /*
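
Clients holding cached stat segment pointers are expected to notice
the epoch bump and re-resolve them; a sketch of the check, with a
hypothetical helper name:

  static u64 known_epoch;
  u64 epoch = (u64) shared_header->opaque[STAT_SEGMENT_OPAQUE_EPOCH];

  if (epoch != known_epoch)
    {
      known_epoch = epoch;
      /* hypothetical: re-walk the directory hash, refresh cached pointers */
      refresh_cached_pointers ();
    }
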
diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c
index f1c40e6..31cfc33 100644
--- a/src/vpp/stats/stats.c
+++ b/src/vpp/stats/stats.c
@@ -2340,10 +2340,12 @@
       ip46_fib_stats_delay (sm, sm->stats_poll_interval_in_seconds,
 			    0 /* nsec */ );
 
+      /* Always update stats segment data */
+      do_stat_segment_updates (sm);
+
       if (!(sm->enable_poller))
-	{
-	  continue;
-	}
+	continue;
+
       if (pool_elts
 	  (sm->stats_registrations[IDX_PER_INTERFACE_COMBINED_COUNTERS]))
 	do_combined_per_interface_counters (sm);
diff --git a/src/vpp/stats/stats.h b/src/vpp/stats/stats.h
index fd1ab27..262304e 100644
--- a/src/vpp/stats/stats.h
+++ b/src/vpp/stats/stats.h
@@ -163,9 +163,23 @@
   uword *counter_vector_by_name;
   clib_spinlock_t *stat_segment_lockp;
 
-  /* Pointers to scalar stats maintained by the stat segment process */
+  /* Pointers to scalar stats maintained by the stat thread */
   f64 *input_rate_ptr;
+  f64 *last_runtime_ptr;
+  f64 *last_runtime_stats_clear_ptr;
   f64 *vector_rate_ptr;
+  u64 last_input_packets;
+
+  /* Pointers to vector stats maintained by the stat thread */
+  u8 *serialized_nodes;
+  vlib_main_t **stat_vms;
+  vlib_node_t ***node_dups;
+
+  f64 *vectors_per_node;
+  f64 *vector_rate_in;
+  f64 *vector_rate_out;
+  f64 *vector_rate_drop;
+  f64 *vector_rate_punt;
 
   /* convenience */
   vlib_main_t *vlib_main;
@@ -187,6 +201,7 @@
   STAT_DIR_TYPE_VECTOR_POINTER,
   STAT_DIR_TYPE_COUNTER_VECTOR,
   STAT_DIR_TYPE_ERROR_INDEX,
+  STAT_DIR_TYPE_SERIALIZED_NODES,
 } stat_directory_type_t;
 
 typedef struct
@@ -195,6 +210,8 @@
   void *value;
 } stat_segment_directory_entry_t;
 
+void do_stat_segment_updates (stats_main_t * sm);
+
 #endif /* __included_stats_h__ */
 
 /*