perfmon plugin: 2-way parallel stat collection

As a FUD reduction measure, this patch implements 2-way parallel
counter collection. Synthetic stat component counter pairs run at the
same time. Running two counters (of any kind) at the same time
naturally reduces the aggregate time required by an approximate
factor-of-2, depending on whether an even or odd number of stats has
been requested.

I don't completely buy the argument that computing synthetic stats
such as instructions-per-clock will be inaccurate if component counter
values are collected sequentially. Given a uniform traffic pattern, it
must make no difference.

As the collection interval increases, the difference between serial
and parallel component counter collection will approach zero; see also
the central limit theorem.

Change-Id: I36ebdcf125e8882cca8a1929ec58f17fba1ad8f1
Signed-off-by: Dave Barach <dave@barachs.net>
diff --git a/src/plugins/perfmon/perfmon.c b/src/plugins/perfmon/perfmon.c
index c6a8022..3595557 100644
--- a/src/plugins/perfmon/perfmon.c
+++ b/src/plugins/perfmon/perfmon.c
@@ -157,10 +157,16 @@
   pm->log_class = vlib_log_register_class ("perfmon", 0);
 
   /* Default data collection interval */
-  pm->timeout_interval = 3.0;
-  vec_validate (pm->pm_fds, vec_len (vlib_mains) - 1);
-  vec_validate (pm->perf_event_pages, vec_len (vlib_mains) - 1);
-  vec_validate (pm->rdpmc_indices, vec_len (vlib_mains) - 1);
+  pm->timeout_interval = 2.0;	/* seconds */
+  vec_validate (pm->pm_fds, 1);
+  vec_validate (pm->pm_fds[0], vec_len (vlib_mains) - 1);
+  vec_validate (pm->pm_fds[1], vec_len (vlib_mains) - 1);
+  vec_validate (pm->perf_event_pages, 1);
+  vec_validate (pm->perf_event_pages[0], vec_len (vlib_mains) - 1);
+  vec_validate (pm->perf_event_pages[1], vec_len (vlib_mains) - 1);
+  vec_validate (pm->rdpmc_indices, 1);
+  vec_validate (pm->rdpmc_indices[0], vec_len (vlib_mains) - 1);
+  vec_validate (pm->rdpmc_indices[1], vec_len (vlib_mains) - 1);
   pm->page_size = getpagesize ();
 
   ht = pm->perfmon_table = 0;
@@ -297,10 +303,12 @@
   perfmon_main_t *pm = &perfmon_main;
   unformat_input_t _line_input, *line_input = &_line_input;
   perfmon_event_config_t ec;
+  f64 delay;
   u32 timeout_seconds;
   u32 deadman;
 
-  vec_reset_length (pm->events_to_collect);
+  vec_reset_length (pm->single_events_to_collect);
+  vec_reset_length (pm->paired_events_to_collect);
   pm->ipc_event_index = ~0;
   pm->mispredict_event_index = ~0;
 
@@ -316,28 +324,28 @@
 	  ec.name = "instructions";
 	  ec.pe_type = PERF_TYPE_HARDWARE;
 	  ec.pe_config = PERF_COUNT_HW_INSTRUCTIONS;
-	  pm->ipc_event_index = vec_len (pm->events_to_collect);
-	  vec_add1 (pm->events_to_collect, ec);
+	  pm->ipc_event_index = vec_len (pm->paired_events_to_collect);
+	  vec_add1 (pm->paired_events_to_collect, ec);
 	  ec.name = "cpu-cycles";
 	  ec.pe_type = PERF_TYPE_HARDWARE;
 	  ec.pe_config = PERF_COUNT_HW_CPU_CYCLES;
-	  vec_add1 (pm->events_to_collect, ec);
+	  vec_add1 (pm->paired_events_to_collect, ec);
 	}
       else if (unformat (line_input, "branch-mispredict-rate"))
 	{
 	  ec.name = "branch-misses";
 	  ec.pe_type = PERF_TYPE_HARDWARE;
 	  ec.pe_config = PERF_COUNT_HW_BRANCH_MISSES;
-	  pm->mispredict_event_index = vec_len (pm->events_to_collect);
-	  vec_add1 (pm->events_to_collect, ec);
+	  pm->mispredict_event_index = vec_len (pm->paired_events_to_collect);
+	  vec_add1 (pm->paired_events_to_collect, ec);
 	  ec.name = "branches";
 	  ec.pe_type = PERF_TYPE_HARDWARE;
 	  ec.pe_config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
-	  vec_add1 (pm->events_to_collect, ec);
+	  vec_add1 (pm->paired_events_to_collect, ec);
 	}
       else if (unformat (line_input, "%U", unformat_processor_event, pm, &ec))
 	{
-	  vec_add1 (pm->events_to_collect, ec);
+	  vec_add1 (pm->single_events_to_collect, ec);
 	}
 #define _(type,event,str)                       \
       else if (unformat (line_input, str))      \
@@ -345,7 +353,7 @@
           ec.name = str;                        \
           ec.pe_type = type;                    \
           ec.pe_config = event;                 \
-          vec_add1 (pm->events_to_collect, ec); \
+          vec_add1 (pm->single_events_to_collect, ec); \
         }
       foreach_perfmon_event
 #undef _
@@ -354,21 +362,33 @@
 				  format_unformat_error, line_input);
     }
 
-  if (vec_len (pm->events_to_collect) == 0)
+  /* Stick paired events at the front of the (unified) list */
+  if (vec_len (pm->paired_events_to_collect) > 0)
+    {
+      perfmon_event_config_t *tmp;
+      /* first 2n events are pairs... */
+      vec_append (pm->paired_events_to_collect, pm->single_events_to_collect);
+      tmp = pm->single_events_to_collect;
+      pm->single_events_to_collect = pm->paired_events_to_collect;
+      pm->paired_events_to_collect = tmp;
+    }
+
+  if (vec_len (pm->single_events_to_collect) == 0)
     return clib_error_return (0, "no events specified...");
 
+  /* Figure out how long data collection will take */
+  delay =
+    ((f64) vec_len (pm->single_events_to_collect)) * pm->timeout_interval;
+  delay /= 2.0;			/* collect 2 stats at once */
+
   vlib_cli_output (vm, "Start collection for %d events, wait %.2f seconds",
-		   vec_len (pm->events_to_collect),
-		   (f64) (vec_len (pm->events_to_collect))
-		   * pm->timeout_interval);
+		   vec_len (pm->single_events_to_collect), delay);
 
   vlib_process_signal_event (pm->vlib_main, perfmon_periodic_node.index,
 			     PERFMON_START, 0);
 
   /* Coarse-grained wait */
-  vlib_process_suspend (vm,
-			((f64) (vec_len (pm->events_to_collect)
-				* pm->timeout_interval)));
+  vlib_process_suspend (vm, delay);
 
   deadman = 0;
   /* Reasonable to guess that collection may not be quite done... */
@@ -438,7 +458,7 @@
       if (i == pm->ipc_event_index)
 	{
 	  f64 ipc_rate;
-	  ASSERT (i + 1 < vec_len (c->counter_names));
+	  ASSERT ((i + 1) < vec_len (c->counter_names));
 
 	  if (c->counter_values[i + 1] > 0)
 	    ipc_rate = (f64) c->counter_values[i]
diff --git a/src/plugins/perfmon/perfmon.h b/src/plugins/perfmon/perfmon.h
index 47ee471..9663dae 100644
--- a/src/plugins/perfmon/perfmon.h
+++ b/src/plugins/perfmon/perfmon.h
@@ -97,8 +97,11 @@
   perfmon_cpuid_and_table_t *perfmon_tables;
   uword *perfmon_table;
 
-  /* vector of events to collect */
-  perfmon_event_config_t *events_to_collect;
+  /* vector of single events to collect */
+  perfmon_event_config_t *single_events_to_collect;
+
+  /* vector of paired events to collect */
+  perfmon_event_config_t *paired_events_to_collect;
 
   /* Base indices of synthetic event tuples */
   u32 ipc_event_index;
@@ -109,13 +112,14 @@
 
   /* Current event (index) being collected */
   u32 current_event;
-  u32 *rdpmc_indices;
+  int n_active;
+  u32 **rdpmc_indices;
   /* mmap base / size of (mapped) struct perf_event_mmap_page */
-  u8 **perf_event_pages;
+  u8 ***perf_event_pages;
   u32 page_size;
 
   /* Current perf_event file descriptors, per thread */
-  int *pm_fds;
+  int **pm_fds;
 
   /* Logging */
   vlib_log_class_t log_class;
diff --git a/src/plugins/perfmon/perfmon_periodic.c b/src/plugins/perfmon/perfmon_periodic.c
index 4e7e237..ae20ac4 100644
--- a/src/plugins/perfmon/perfmon_periodic.c
+++ b/src/plugins/perfmon/perfmon_periodic.c
@@ -31,22 +31,34 @@
   return ret;
 }
 
-static u64
-read_current_perf_counter (vlib_main_t * vm)
+static void
+read_current_perf_counters (vlib_main_t * vm, u64 * c0, u64 * c1)
 {
-  if (vm->perf_counter_id)
-    return clib_rdpmc (vm->perf_counter_id);
-  else
+  int i;
+  u64 *cc;
+  perfmon_main_t *pm = &perfmon_main;
+  uword my_thread_index = vm->thread_index;
+
+  *c0 = *c1 = 0;
+
+  for (i = 0; i < pm->n_active; i++)
     {
-      u64 sw_value;
-      if (read (vm->perf_counter_fd, &sw_value, sizeof (sw_value)) !=
-	  sizeof (sw_value))
+      cc = (i == 0) ? c0 : c1;
+      if (pm->rdpmc_indices[i][my_thread_index] != ~0)
+	*cc = clib_rdpmc ((int) pm->rdpmc_indices[i][my_thread_index]);
+      else
 	{
-	  clib_unix_warning ("counter read failed, disable collection...");
-	  vm->vlib_node_runtime_perf_counter_cb = 0;
-	  return 0ULL;
+	  u64 sw_value;
+	  if (read (pm->pm_fds[i][my_thread_index], &sw_value,
+		    sizeof (sw_value)) != sizeof (sw_value))
+	    {
+	      clib_unix_warning
+		("counter read failed, disable collection...");
+	      vm->vlib_node_runtime_perf_counter_cb = 0;
+	      return;
+	    }
+	  *cc = sw_value;
 	}
-      return sw_value;
     }
 }
 
@@ -80,9 +92,11 @@
       for (i = 0; i < vec_len (nm->nodes); i++)
 	{
 	  n = nm->nodes[i];
-	  n->stats_total.perf_counter_ticks = 0;
+	  n->stats_total.perf_counter0_ticks = 0;
+	  n->stats_total.perf_counter1_ticks = 0;
 	  n->stats_total.perf_counter_vectors = 0;
-	  n->stats_last_clear.perf_counter_ticks = 0;
+	  n->stats_last_clear.perf_counter0_ticks = 0;
+	  n->stats_last_clear.perf_counter1_ticks = 0;
 	  n->stats_last_clear.perf_counter_vectors = 0;
 	}
     }
@@ -90,7 +104,7 @@
 }
 
 static void
-enable_current_event (perfmon_main_t * pm)
+enable_current_events (perfmon_main_t * pm)
 {
   struct perf_event_attr pe;
   int fd;
@@ -98,91 +112,108 @@
   perfmon_event_config_t *c;
   vlib_main_t *vm = vlib_get_main ();
   u32 my_thread_index = vm->thread_index;
+  u32 index;
+  int i, limit = 1;
 
-  c = vec_elt_at_index (pm->events_to_collect, pm->current_event);
+  if ((pm->current_event + 1) < vec_len (pm->single_events_to_collect))
+    limit = 2;
 
-  memset (&pe, 0, sizeof (struct perf_event_attr));
-  pe.type = c->pe_type;
-  pe.size = sizeof (struct perf_event_attr);
-  pe.config = c->pe_config;
-  pe.disabled = 1;
-  pe.pinned = 1;
-  /*
-   * Note: excluding the kernel makes the
-   * (software) context-switch counter read 0...
-   */
-  if (pe.type != PERF_TYPE_SOFTWARE)
+  for (i = 0; i < limit; i++)
     {
-      /* Exclude kernel and hypervisor */
-      pe.exclude_kernel = 1;
-      pe.exclude_hv = 1;
-    }
+      c = vec_elt_at_index (pm->single_events_to_collect,
+			    pm->current_event + i);
 
-  fd = perf_event_open (&pe, 0, -1, -1, 0);
-  if (fd == -1)
-    {
-      clib_unix_warning ("event open: type %d config %d", c->pe_type,
-			 c->pe_config);
-      return;
-    }
-
-  if (pe.type != PERF_TYPE_SOFTWARE)
-    {
-      p = mmap (0, pm->page_size, PROT_READ, MAP_SHARED, fd, 0);
-      if (p == MAP_FAILED)
+      memset (&pe, 0, sizeof (struct perf_event_attr));
+      pe.type = c->pe_type;
+      pe.size = sizeof (struct perf_event_attr);
+      pe.config = c->pe_config;
+      pe.disabled = 1;
+      pe.pinned = 1;
+      /*
+       * Note: excluding the kernel makes the
+       * (software) context-switch counter read 0...
+       */
+      if (pe.type != PERF_TYPE_SOFTWARE)
 	{
-	  clib_unix_warning ("mmap");
-	  close (fd);
+	  /* Exclude kernel and hypervisor */
+	  pe.exclude_kernel = 1;
+	  pe.exclude_hv = 1;
+	}
+
+      fd = perf_event_open (&pe, 0, -1, -1, 0);
+      if (fd == -1)
+	{
+	  clib_unix_warning ("event open: type %d config %d", c->pe_type,
+			     c->pe_config);
 	  return;
 	}
+
+      if (pe.type != PERF_TYPE_SOFTWARE)
+	{
+	  p = mmap (0, pm->page_size, PROT_READ, MAP_SHARED, fd, 0);
+	  if (p == MAP_FAILED)
+	    {
+	      clib_unix_warning ("mmap");
+	      close (fd);
+	      return;
+	    }
+	}
+      else
+	p = 0;
+
+      /*
+       * Software event counters - and others not capable of being
+       * read via the "rdpmc" instruction - will be read
+       * by system calls.
+       */
+      if (pe.type == PERF_TYPE_SOFTWARE || p->cap_user_rdpmc == 0)
+	index = ~0;
+      else
+	index = p->index - 1;
+
+      if (ioctl (fd, PERF_EVENT_IOC_RESET, 0) < 0)
+	clib_unix_warning ("reset ioctl");
+
+      if (ioctl (fd, PERF_EVENT_IOC_ENABLE, 0) < 0)
+	clib_unix_warning ("enable ioctl");
+
+      pm->rdpmc_indices[i][my_thread_index] = index;
+      pm->perf_event_pages[i][my_thread_index] = (void *) p;
+      pm->pm_fds[i][my_thread_index] = fd;
     }
 
-  if (ioctl (fd, PERF_EVENT_IOC_RESET, 0) < 0)
-    clib_unix_warning ("reset ioctl");
-
-  if (ioctl (fd, PERF_EVENT_IOC_ENABLE, 0) < 0)
-    clib_unix_warning ("enable ioctl");
-
-  /*
-   * Software event counters - and others not capable of being
-   * read via the "rdpmc" instruction - will be read
-   * by system calls.
-   */
-  if (pe.type == PERF_TYPE_SOFTWARE || p->cap_user_rdpmc == 0)
-    pm->rdpmc_indices[my_thread_index] = 0;
-  else				/* use rdpmc instrs */
-    pm->rdpmc_indices[my_thread_index] = p->index - 1;
-  pm->perf_event_pages[my_thread_index] = (void *) p;
-
-  pm->pm_fds[my_thread_index] = fd;
-
+  pm->n_active = i;
   /* Enable the main loop counter snapshot mechanism */
-  vm->perf_counter_id = pm->rdpmc_indices[my_thread_index];
-  vm->perf_counter_fd = fd;
-  vm->vlib_node_runtime_perf_counter_cb = read_current_perf_counter;
+  vm->vlib_node_runtime_perf_counter_cb = read_current_perf_counters;
 }
 
 static void
-disable_event (perfmon_main_t * pm)
+disable_events (perfmon_main_t * pm)
 {
   vlib_main_t *vm = vlib_get_main ();
   u32 my_thread_index = vm->thread_index;
-
-  if (pm->pm_fds[my_thread_index] == 0)
-    return;
+  int i;
 
   /* Stop main loop collection */
   vm->vlib_node_runtime_perf_counter_cb = 0;
 
-  if (ioctl (pm->pm_fds[my_thread_index], PERF_EVENT_IOC_DISABLE, 0) < 0)
-    clib_unix_warning ("disable ioctl");
+  for (i = 0; i < pm->n_active; i++)
+    {
+      if (pm->pm_fds[i][my_thread_index] == 0)
+	continue;
 
-  if (pm->perf_event_pages[my_thread_index])
-    if (munmap (pm->perf_event_pages[my_thread_index], pm->page_size) < 0)
-      clib_unix_warning ("munmap");
+      if (ioctl (pm->pm_fds[i][my_thread_index], PERF_EVENT_IOC_DISABLE, 0) <
+	  0)
+	clib_unix_warning ("disable ioctl");
 
-  (void) close (pm->pm_fds[my_thread_index]);
-  pm->pm_fds[my_thread_index] = 0;
+      if (pm->perf_event_pages[i][my_thread_index])
+	if (munmap (pm->perf_event_pages[i][my_thread_index],
+		    pm->page_size) < 0)
+	  clib_unix_warning ("munmap");
+
+      (void) close (pm->pm_fds[i][my_thread_index]);
+      pm->pm_fds[i][my_thread_index] = 0;
+    }
 }
 
 static void
@@ -190,7 +221,7 @@
 {
   perfmon_main_t *pm = &perfmon_main;
 
-  enable_current_event (pm);
+  enable_current_events (pm);
   vm->worker_thread_main_loop_callback = 0;
 }
 
@@ -198,7 +229,7 @@
 worker_thread_stop_event (vlib_main_t * vm)
 {
   perfmon_main_t *pm = &perfmon_main;
-  disable_event (pm);
+  disable_events (pm);
   vm->worker_thread_main_loop_callback = 0;
 }
 
@@ -207,7 +238,7 @@
 {
   int i;
   pm->current_event = 0;
-  if (vec_len (pm->events_to_collect) == 0)
+  if (vec_len (pm->single_events_to_collect) == 0)
     {
       pm->state = PERFMON_STATE_OFF;
       return;
@@ -216,7 +247,7 @@
   clear_counters (pm);
 
   /* Start collection on this thread */
-  enable_current_event (pm);
+  enable_current_events (pm);
 
   /* And also on worker threads */
   for (i = 1; i < vec_len (vlib_mains); i++)
@@ -231,7 +262,7 @@
 void
 scrape_and_clear_counters (perfmon_main_t * pm)
 {
-  int i, j;
+  int i, j, k;
   vlib_main_t *vm = pm->vlib_main;
   vlib_main_t *stat_vm;
   vlib_node_main_t *nm;
@@ -242,7 +273,6 @@
   perfmon_event_config_t *current_event;
   uword *p;
   u8 *counter_name;
-  u64 counter_value;
   u64 vectors_this_counter;
 
   /* snapshoot the nodes, including pm counters */
@@ -272,17 +302,17 @@
 	  n = nm->nodes[i];
 	  nodes[i] = clib_mem_alloc (sizeof (*n));
 	  clib_memcpy_fast (nodes[i], n, sizeof (*n));
-	  n->stats_total.perf_counter_ticks = 0;
+	  n->stats_total.perf_counter0_ticks = 0;
+	  n->stats_total.perf_counter1_ticks = 0;
 	  n->stats_total.perf_counter_vectors = 0;
-	  n->stats_last_clear.perf_counter_ticks = 0;
+	  n->stats_last_clear.perf_counter0_ticks = 0;
+	  n->stats_last_clear.perf_counter1_ticks = 0;
 	  n->stats_last_clear.perf_counter_vectors = 0;
 	}
     }
 
   vlib_worker_thread_barrier_release (vm);
 
-  current_event = pm->events_to_collect + pm->current_event;
-
   for (j = 0; j < vec_len (vlib_mains); j++)
     {
       stat_vm = vlib_mains[j];
@@ -296,38 +326,69 @@
 	  u8 *capture_name;
 
 	  n = nodes[i];
-	  if (n->stats_total.perf_counter_ticks == 0)
+
+	  if (n->stats_total.perf_counter0_ticks == 0 &&
+	      n->stats_total.perf_counter1_ticks == 0)
+	    goto skip_this_node;
+
+	  for (k = 0; k < 2; k++)
 	    {
-	      clib_mem_free (n);
-	      continue;
+	      u64 counter_value, counter_last_clear;
+
+	      /*
+	       * We collect 2 counters at once, except for the
+	       * last counter when the user asks for an odd number of
+	       * counters
+	       */
+	      if ((pm->current_event + k)
+		  >= vec_len (pm->single_events_to_collect))
+		break;
+
+	      if (k == 0)
+		{
+		  counter_value = n->stats_total.perf_counter0_ticks;
+		  counter_last_clear =
+		    n->stats_last_clear.perf_counter0_ticks;
+		}
+	      else
+		{
+		  counter_value = n->stats_total.perf_counter1_ticks;
+		  counter_last_clear =
+		    n->stats_last_clear.perf_counter1_ticks;
+		}
+
+	      capture_name = format (0, "t%d-%v%c", j, n->name, 0);
+
+	      p = hash_get_mem (pm->capture_by_thread_and_node_name,
+				capture_name);
+
+	      if (p == 0)
+		{
+		  pool_get (pm->capture_pool, c);
+		  memset (c, 0, sizeof (*c));
+		  c->thread_and_node_name = capture_name;
+		  hash_set_mem (pm->capture_by_thread_and_node_name,
+				capture_name, c - pm->capture_pool);
+		}
+	      else
+		{
+		  c = pool_elt_at_index (pm->capture_pool, p[0]);
+		  vec_free (capture_name);
+		}
+
+	      /* Snapshoot counters, etc. into the capture */
+	      current_event = pm->single_events_to_collect
+		+ pm->current_event + k;
+	      counter_name = (u8 *) current_event->name;
+	      vectors_this_counter = n->stats_total.perf_counter_vectors -
+		n->stats_last_clear.perf_counter_vectors;
+
+	      vec_add1 (c->counter_names, counter_name);
+	      vec_add1 (c->counter_values,
+			counter_value - counter_last_clear);
+	      vec_add1 (c->vectors_this_counter, vectors_this_counter);
 	    }
-
-	  capture_name = format (0, "t%d-%v%c", j, n->name, 0);
-
-	  p = hash_get_mem (pm->capture_by_thread_and_node_name,
-			    capture_name);
-
-	  if (p == 0)
-	    {
-	      pool_get (pm->capture_pool, c);
-	      memset (c, 0, sizeof (*c));
-	      c->thread_and_node_name = capture_name;
-	      hash_set_mem (pm->capture_by_thread_and_node_name,
-			    capture_name, c - pm->capture_pool);
-	    }
-	  else
-	    c = pool_elt_at_index (pm->capture_pool, p[0]);
-
-	  /* Snapshoot counters, etc. into the capture */
-	  counter_name = (u8 *) current_event->name;
-	  counter_value = n->stats_total.perf_counter_ticks -
-	    n->stats_last_clear.perf_counter_ticks;
-	  vectors_this_counter = n->stats_total.perf_counter_vectors -
-	    n->stats_last_clear.perf_counter_vectors;
-
-	  vec_add1 (c->counter_names, counter_name);
-	  vec_add1 (c->counter_values, counter_value);
-	  vec_add1 (c->vectors_this_counter, vectors_this_counter);
+	skip_this_node:
 	  clib_mem_free (n);
 	}
       vec_free (nodes);
@@ -339,7 +400,7 @@
 handle_timeout (perfmon_main_t * pm, f64 now)
 {
   int i;
-  disable_event (pm);
+  disable_events (pm);
 
   /* And also on worker threads */
   for (i = 1; i < vec_len (vlib_mains); i++)
@@ -354,14 +415,14 @@
   if (i > 1)
     vlib_process_suspend (pm->vlib_main, 1e-3);
   scrape_and_clear_counters (pm);
-  pm->current_event++;
-  if (pm->current_event >= vec_len (pm->events_to_collect))
+  pm->current_event += pm->n_active;
+  if (pm->current_event >= vec_len (pm->single_events_to_collect))
     {
       pm->current_event = 0;
       pm->state = PERFMON_STATE_OFF;
       return;
     }
-  enable_current_event (pm);
+  enable_current_events (pm);
 
   /* And also on worker threads */
   for (i = 1; i < vec_len (vlib_mains); i++)
diff --git a/src/vlib/main.c b/src/vlib/main.c
index 23c4e07..0e480fa 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -543,15 +543,17 @@
 vlib_node_runtime_sync_stats (vlib_main_t * vm,
 			      vlib_node_runtime_t * r,
 			      uword n_calls, uword n_vectors, uword n_clocks,
-			      uword n_ticks)
+			      uword n_ticks0, uword n_ticks1)
 {
   vlib_node_t *n = vlib_get_node (vm, r->node_index);
 
   n->stats_total.calls += n_calls + r->calls_since_last_overflow;
   n->stats_total.vectors += n_vectors + r->vectors_since_last_overflow;
   n->stats_total.clocks += n_clocks + r->clocks_since_last_overflow;
-  n->stats_total.perf_counter_ticks += n_ticks +
-    r->perf_counter_ticks_since_last_overflow;
+  n->stats_total.perf_counter0_ticks += n_ticks0 +
+    r->perf_counter0_ticks_since_last_overflow;
+  n->stats_total.perf_counter1_ticks += n_ticks1 +
+    r->perf_counter1_ticks_since_last_overflow;
   n->stats_total.perf_counter_vectors += n_vectors +
     r->perf_counter_vectors_since_last_overflow;
   n->stats_total.max_clock = r->max_clock;
@@ -560,7 +562,8 @@
   r->calls_since_last_overflow = 0;
   r->vectors_since_last_overflow = 0;
   r->clocks_since_last_overflow = 0;
-  r->perf_counter_ticks_since_last_overflow = 0ULL;
+  r->perf_counter0_ticks_since_last_overflow = 0ULL;
+  r->perf_counter1_ticks_since_last_overflow = 0ULL;
   r->perf_counter_vectors_since_last_overflow = 0ULL;
 }
 
@@ -568,12 +571,12 @@
 vlib_process_sync_stats (vlib_main_t * vm,
 			 vlib_process_t * p,
 			 uword n_calls, uword n_vectors, uword n_clocks,
-			 uword n_ticks)
+			 uword n_ticks0, uword n_ticks1)
 {
   vlib_node_runtime_t *rt = &p->node_runtime;
   vlib_node_t *n = vlib_get_node (vm, rt->node_index);
   vlib_node_runtime_sync_stats (vm, rt, n_calls, n_vectors, n_clocks,
-				n_ticks);
+				n_ticks0, n_ticks1);
   n->stats_total.suspends += p->n_suspends;
   p->n_suspends = 0;
 }
@@ -599,7 +602,7 @@
       vec_elt_at_index (vm->node_main.nodes_by_type[n->type],
 			n->runtime_index);
 
-  vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0, 0);
+  vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0, 0, 0);
 
   /* Sync up runtime next frame vector counters with main node structure. */
   {
@@ -620,27 +623,30 @@
 				vlib_node_runtime_t * node,
 				uword n_calls,
 				uword n_vectors, uword n_clocks,
-				uword n_ticks)
+				uword n_ticks0, uword n_ticks1)
 {
   u32 ca0, ca1, v0, v1, cl0, cl1, r;
-  u32 ptick0, ptick1, pvec0, pvec1;
+  u32 ptick00, ptick01, ptick10, ptick11, pvec0, pvec1;
 
   cl0 = cl1 = node->clocks_since_last_overflow;
   ca0 = ca1 = node->calls_since_last_overflow;
   v0 = v1 = node->vectors_since_last_overflow;
-  ptick0 = ptick1 = node->perf_counter_ticks_since_last_overflow;
+  ptick00 = ptick01 = node->perf_counter0_ticks_since_last_overflow;
+  ptick10 = ptick11 = node->perf_counter1_ticks_since_last_overflow;
   pvec0 = pvec1 = node->perf_counter_vectors_since_last_overflow;
 
   ca1 = ca0 + n_calls;
   v1 = v0 + n_vectors;
   cl1 = cl0 + n_clocks;
-  ptick1 = ptick0 + n_ticks;
+  ptick01 = ptick00 + n_ticks0;
+  ptick11 = ptick10 + n_ticks1;
   pvec1 = pvec0 + n_vectors;
 
   node->calls_since_last_overflow = ca1;
   node->clocks_since_last_overflow = cl1;
   node->vectors_since_last_overflow = v1;
-  node->perf_counter_ticks_since_last_overflow = ptick1;
+  node->perf_counter0_ticks_since_last_overflow = ptick01;
+  node->perf_counter1_ticks_since_last_overflow = ptick11;
   node->perf_counter_vectors_since_last_overflow = pvec1;
 
   node->max_clock_n = node->max_clock > n_clocks ?
@@ -649,38 +655,39 @@
 
   r = vlib_node_runtime_update_main_loop_vector_stats (vm, node, n_vectors);
 
-  if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0) || (ptick1 < ptick0)
-      || (pvec1 < pvec0))
+  if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0) || (ptick01 < ptick00)
+      || (ptick11 < ptick10) || (pvec1 < pvec0))
     {
       node->calls_since_last_overflow = ca0;
       node->clocks_since_last_overflow = cl0;
       node->vectors_since_last_overflow = v0;
-      node->perf_counter_ticks_since_last_overflow = ptick0;
+      node->perf_counter0_ticks_since_last_overflow = ptick00;
+      node->perf_counter1_ticks_since_last_overflow = ptick10;
       node->perf_counter_vectors_since_last_overflow = pvec0;
 
       vlib_node_runtime_sync_stats (vm, node, n_calls, n_vectors, n_clocks,
-				    n_ticks);
+				    n_ticks0, n_ticks1);
     }
 
   return r;
 }
 
-static inline u64
-vlib_node_runtime_perf_counter (vlib_main_t * vm)
+static inline void
+vlib_node_runtime_perf_counter (vlib_main_t * vm, u64 * pmc0, u64 * pmc1)
 {
+  *pmc0 = 0;
+  *pmc1 = 0;
   if (PREDICT_FALSE (vm->vlib_node_runtime_perf_counter_cb != 0))
-    return ((*vm->vlib_node_runtime_perf_counter_cb) (vm));
-  return 0ULL;
+    (*vm->vlib_node_runtime_perf_counter_cb) (vm, pmc0, pmc1);
 }
 
 always_inline void
 vlib_process_update_stats (vlib_main_t * vm,
 			   vlib_process_t * p,
-			   uword n_calls, uword n_vectors, uword n_clocks,
-			   uword n_ticks)
+			   uword n_calls, uword n_vectors, uword n_clocks)
 {
   vlib_node_runtime_update_stats (vm, &p->node_runtime,
-				  n_calls, n_vectors, n_clocks, n_ticks);
+				  n_calls, n_vectors, n_clocks, 0ULL, 0ULL);
 }
 
 static clib_error_t *
@@ -1098,6 +1105,8 @@
     }
 }
 
+u64 oingo0, oingo1;
+
 static_always_inline u64
 dispatch_node (vlib_main_t * vm,
 	       vlib_node_runtime_t * node,
@@ -1146,18 +1155,14 @@
 
   if (1 /* || vm->thread_index == node->thread_index */ )
     {
-      u64 pmc_before, pmc_delta;
+      u64 pmc_before[2], pmc_after[2], pmc_delta[2];
 
       vlib_elog_main_loop_event (vm, node->node_index,
 				 last_time_stamp,
 				 frame ? frame->n_vectors : 0,
 				 /* is_after */ 0);
 
-      /*
-       * To validate accounting: pmc_before = last_time_stamp
-       * perf ticks should equal clocks/pkt...
-       */
-      pmc_before = vlib_node_runtime_perf_counter (vm);
+      vlib_node_runtime_perf_counter (vm, &pmc_before[0], &pmc_before[1]);
 
       /*
        * Turn this on if you run into
@@ -1191,7 +1196,10 @@
        * To validate accounting: pmc_delta = t - pmc_before;
        * perf ticks should equal clocks/pkt...
        */
-      pmc_delta = vlib_node_runtime_perf_counter (vm) - pmc_before;
+      vlib_node_runtime_perf_counter (vm, &pmc_after[0], &pmc_after[1]);
+
+      pmc_delta[0] = pmc_after[0] - pmc_before[0];
+      pmc_delta[1] = pmc_after[1] - pmc_before[1];
 
       vlib_elog_main_loop_event (vm, node->node_index, t, n,	/* is_after */
 				 1);
@@ -1199,11 +1207,18 @@
       vm->main_loop_vectors_processed += n;
       vm->main_loop_nodes_processed += n > 0;
 
+      if (pmc_delta[0] || pmc_delta[1])
+	{
+	  oingo0 += pmc_delta[0];
+	  oingo1 += pmc_delta[1];
+	}
+
       v = vlib_node_runtime_update_stats (vm, node,
 					  /* n_calls */ 1,
 					  /* n_vectors */ n,
 					  /* n_clocks */ t - last_time_stamp,
-					  pmc_delta /* PMC ticks */ );
+					  pmc_delta[0] /* PMC0 */ ,
+					  pmc_delta[1] /* PMC1 */ );
 
       /* When in interrupt mode and vector rate crosses threshold switch to
          polling mode. */
@@ -1542,8 +1557,7 @@
   vlib_process_update_stats (vm, p,
 			     /* n_calls */ !is_suspend,
 			     /* n_vectors */ n_vectors,
-			     /* n_clocks */ t - last_time_stamp,
-			     /* pmc_ticks */ 0ULL);
+			     /* n_clocks */ t - last_time_stamp);
 
   return t;
 }
@@ -1626,8 +1640,7 @@
   vlib_process_update_stats (vm, p,
 			     /* n_calls */ !is_suspend,
 			     /* n_vectors */ n_vectors,
-			     /* n_clocks */ t - last_time_stamp,
-			     /* pmc_ticks */ 0ULL);
+			     /* n_clocks */ t - last_time_stamp);
 
   return t;
 }
@@ -1677,9 +1690,6 @@
   if (!nm->interrupt_threshold_vector_length)
     nm->interrupt_threshold_vector_length = 5;
 
-  /* Make sure the performance monitor counter is disabled */
-  vm->perf_counter_id = ~0;
-
   /* Start all processes. */
   if (is_main)
     {
diff --git a/src/vlib/main.h b/src/vlib/main.h
index 91661fd..4c6d0f4 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -84,9 +84,8 @@
   u32 node_counts_per_main_loop[2];
 
   /* Main loop hw / sw performance counters */
-    u64 (*vlib_node_runtime_perf_counter_cb) (struct vlib_main_t *);
-  int perf_counter_id;
-  int perf_counter_fd;
+  void (*vlib_node_runtime_perf_counter_cb) (struct vlib_main_t *,
+					     u64 *, u64 *);
 
   /* Every so often we switch to the next counter. */
 #define VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE 7
diff --git a/src/vlib/node.h b/src/vlib/node.h
index f41eb60..8bb89f4 100644
--- a/src/vlib/node.h
+++ b/src/vlib/node.h
@@ -258,7 +258,8 @@
   u64 calls, vectors, clocks, suspends;
   u64 max_clock;
   u64 max_clock_n;
-  u64 perf_counter_ticks;
+  u64 perf_counter0_ticks;
+  u64 perf_counter1_ticks;
   u64 perf_counter_vectors;
 } vlib_node_stats_t;
 
@@ -507,7 +508,8 @@
   u32 vectors_since_last_overflow;	/**< Number of vector elements
 					  processed by this node. */
 
-  u32 perf_counter_ticks_since_last_overflow; /**< Perf counter ticks */
+  u32 perf_counter0_ticks_since_last_overflow; /**< Perf counter 0 ticks */
+  u32 perf_counter1_ticks_since_last_overflow; /**< Perf counter 1 ticks */
   u32 perf_counter_vectors_since_last_overflow;	/**< Perf counter vectors */
 
   u32 next_frame_index;			/**< Start of next frames for this
diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c
index 062854a..ad17c1d 100644
--- a/src/vlib/node_cli.c
+++ b/src/vlib/node_cli.c
@@ -148,8 +148,6 @@
   f64 maxc, maxcn;
   u32 maxn;
   u32 indent;
-  u64 pmc_ticks;
-  f64 pmc_ticks_per_packet;
 
   if (!n)
     {
@@ -163,9 +161,6 @@
 		    "%=30s%=12s%=16s%=16s%=16s%=16s%=16s",
 		    "Name", "State", "Calls", "Vectors", "Suspends",
 		    "Clocks", "Vectors/Call");
-      if (vm->perf_counter_id)
-	s = format (s, "%=16s", "Perf Ticks");
-
       return s;
     }
 
@@ -182,13 +177,6 @@
   else
     maxcn = 0.0;
 
-  pmc_ticks = n->stats_total.perf_counter_ticks -
-    n->stats_last_clear.perf_counter_ticks;
-  if (p > 0)
-    pmc_ticks_per_packet = (f64) pmc_ticks / (f64) p;
-  else
-    pmc_ticks_per_packet = 0.0;
-
   /* Clocks per packet, per call or per suspend. */
   x = 0;
   if (p > 0)
@@ -221,9 +209,6 @@
     s = format (s, "%-30v%=12U%16Ld%16Ld%16Ld%16.2e%16.2f", ns,
 		format_vlib_node_state, vm, n, c, p, d, x, v);
 
-  if (pmc_ticks_per_packet > 0.0)
-    s = format (s, "%16.2e", pmc_ticks_per_packet);
-
   if (ns != n->name)
     vec_free (ns);
 
diff --git a/src/vlibapi/node_serialize.c b/src/vlibapi/node_serialize.c
index 0774eea..b50d79e 100644
--- a/src/vlibapi/node_serialize.c
+++ b/src/vlibapi/node_serialize.c
@@ -57,7 +57,7 @@
   u8 *namep;
   u32 name_bytes;
   uword i, j, k;
-  u64 l, v, c, d, pmc;
+  u64 l, v, c, d;
   state_string_enum_t state_code;
 
   serialize_open_vector (sm, vector);
@@ -77,8 +77,6 @@
 	  v = n->stats_total.vectors - n->stats_last_clear.vectors;
 	  c = n->stats_total.calls - n->stats_last_clear.calls;
 	  d = n->stats_total.suspends - n->stats_last_clear.suspends;
-	  pmc = n->stats_total.perf_counter_ticks
-	    - n->stats_last_clear.perf_counter_ticks;
 
 	  state_code = STATE_INTERNAL;
 
@@ -151,8 +149,6 @@
 	      serialize_integer (sm, v, 8);
 	      /* Total suspends */
 	      serialize_integer (sm, d, 8);
-	      /* PMC counter */
-	      serialize_integer (sm, pmc, 8);
 	    }
 	  else			/* no stats */
 	    serialize_likely_small_unsigned_integer (sm, 0);
@@ -171,7 +167,7 @@
   vlib_node_t **nodes;
   vlib_node_t ***nodes_by_thread = 0;
   int i, j, k;
-  u64 l, v, c, d, pmc;
+  u64 l, v, c, d;
   state_string_enum_t state_code;
   int stats_present;
 
@@ -229,9 +225,6 @@
 	      /* Total suspends */
 	      unserialize_integer (sm, &d, 8);
 	      node->stats_total.suspends = d;
-	      /* PMC counter */
-	      unserialize_integer (sm, &pmc, 8);
-	      node->stats_total.perf_counter_ticks = pmc;
 	    }
 	}
     }