64 bit per-thread counters

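Replace the per-thread u16 "mini" counters and the shared, atomically
updated u64 "maxi" counters with plain 64-bit per-thread counters, so
the increment path needs no atomic operations.

after: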
TenGigabitEthernet5/0/1-output   active             107522        17375708               0          7.22e0          161.60
TenGigabitEthernet5/0/1-tx       active             107522        17375708               0          6.93e1          161.60
ip4-input-no-checksum            active             107522        17375708               0          2.52e1          161.60
ip4-lookup                       active             107522        17375708               0          3.10e1          161.60
ip4-rewrite                      active             107522        17375708               0          2.52e1          161.60

before:
TenGigabitEthernet5/0/1-output   active             433575       110995200               0          6.95e0          256.00
TenGigabitEthernet5/0/1-tx       active             433575       110995200               0          7.14e1          256.00
ip4-input-no-checksum            active             433575       110995200               0          2.66e1          256.00
ip4-lookup                       active             433575       110995200               0          3.29e1          256.00
ip4-rewrite                      active             433575       110995200               0          2.59e1          256.00

Change-Id: I46405bd22189f48a39f06e3443bb7e13f410b539
Signed-off-by: Neale Ranns <nranns@cisco.com>
diff --git a/src/vlib/counter.c b/src/vlib/counter.c
index 9f66e04..62f4bd6 100644
--- a/src/vlib/counter.c
+++ b/src/vlib/counter.c
@@ -42,56 +42,36 @@
 void
 vlib_clear_simple_counters (vlib_simple_counter_main_t * cm)
 {
+  counter_t *my_counters;
   uword i, j;
-  u16 *my_minis;
 
-  for (i = 0; i < vec_len (cm->minis); i++)
+  for (i = 0; i < vec_len (cm->counters); i++)
     {
-      my_minis = cm->minis[i];
+      my_counters = cm->counters[i];
 
-      for (j = 0; j < vec_len (my_minis); j++)
+      for (j = 0; j < vec_len (my_counters); j++)
 	{
-	  cm->maxi[j] += my_minis[j];
-	  my_minis[j] = 0;
+	  my_counters[j] = 0;
 	}
     }
-
-  j = vec_len (cm->maxi);
-  if (j > 0)
-    vec_validate (cm->value_at_last_clear, j - 1);
-  for (i = 0; i < j; i++)
-    cm->value_at_last_clear[i] = cm->maxi[i];
 }
 
 void
 vlib_clear_combined_counters (vlib_combined_counter_main_t * cm)
 {
+  vlib_counter_t *my_counters;
   uword i, j;
-  vlib_mini_counter_t *my_minis;
 
-  for (i = 0; i < vec_len (cm->minis); i++)
+  for (i = 0; i < vec_len (cm->counters); i++)
     {
-      my_minis = cm->minis[i];
+      my_counters = cm->counters[i];
 
-      for (j = 0; j < vec_len (my_minis); j++)
+      for (j = 0; j < vec_len (my_counters); j++)
 	{
-	  cm->maxi[j].packets += my_minis[j].packets;
-	  cm->maxi[j].bytes += my_minis[j].bytes;
-	  my_minis[j].packets = 0;
-	  my_minis[j].bytes = 0;
+	  my_counters[j].packets = 0;
+	  my_counters[j].bytes = 0;
 	}
     }
-
-  j = vec_len (cm->maxi);
-  if (j > 0)
-    vec_validate (cm->value_at_last_clear, j - 1);
-
-  for (i = 0; i < j; i++)
-    {
-      vlib_counter_t *c = vec_elt_at_index (cm->value_at_last_clear, i);
-
-      c[0] = cm->maxi[i];
-    }
 }
 
 void
@@ -100,10 +80,9 @@
   vlib_thread_main_t *tm = vlib_get_thread_main ();
   int i;
 
-  vec_validate (cm->minis, tm->n_vlib_mains - 1);
+  vec_validate (cm->counters, tm->n_vlib_mains - 1);
   for (i = 0; i < tm->n_vlib_mains; i++)
-    vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES);
-  vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES);
+    vec_validate_aligned (cm->counters[i], index, CLIB_CACHE_LINE_BYTES);
 }
 
 void
@@ -112,10 +91,23 @@
   vlib_thread_main_t *tm = vlib_get_thread_main ();
   int i;
 
-  vec_validate (cm->minis, tm->n_vlib_mains - 1);
+  vec_validate (cm->counters, tm->n_vlib_mains - 1);
   for (i = 0; i < tm->n_vlib_mains; i++)
-    vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES);
-  vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES);
+    vec_validate_aligned (cm->counters[i], index, CLIB_CACHE_LINE_BYTES);
+}
+
+u32
+vlib_combined_counter_n_counters (const vlib_combined_counter_main_t * cm)
+{
+  ASSERT (cm->counters);
+  return (vec_len (cm->counters[0]));
+}
+
+u32
+vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm)
+{
+  ASSERT (cm->counters);
+  return (vec_len (cm->counters[0]));
 }
 
 void
diff --git a/src/vlib/counter.h b/src/vlib/counter.h
index abfa89e..17a8521 100644
--- a/src/vlib/counter.h
+++ b/src/vlib/counter.h
@@ -44,59 +44,48 @@
 
     Optimized thread-safe counters.
 
-    Each vlib_[simple|combined]_counter_main_t consists of a single
-    vector of thread-safe / atomically-updated u64 counters [the
-    "maxi" vector], and a (u16 **) per-thread vector [the "minis"
-    vector] of narrow, per-thread counters.
+    Each vlib_[simple|combined]_counter_main_t consists of a per-thread
+    vector of per-object counters.
 
-    The idea is to drastically reduce the number of atomic operations.
-    In the case of packet counts, we divide the number of atomic ops
-    by 2**16, etc.
+    The idea is to eliminate atomic operations from the increment path.
 */
 
+/** 64bit counters */
+typedef u64 counter_t;
+
 /** A collection of simple counters */
 
 typedef struct
 {
-  u16 **minis;	 /**< Per-thread u16 non-atomic counters */
-  u64 *maxi;	 /**< Shared wide counters */
-  u64 *value_at_last_clear; /**< Counter values as of last clear. */
-  u64 *value_at_last_serialize;	/**< Values as of last serialize. */
+  counter_t **counters;	 /**< Per-thread u64 non-atomic counters */
+  counter_t *value_at_last_serialize;	/**< Values as of last serialize. */
   u32 last_incremental_serialize_index;	/**< Last counter index
                                            serialized incrementally. */
 
   char *name;			/**< The counter collection's name. */
 } vlib_simple_counter_main_t;
 
+/** The number of counters (not the number of per-thread counters) */
+u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm);
+
 /** Increment a simple counter
     @param cm - (vlib_simple_counter_main_t *) simple counter main pointer
     @param cpu_index - (u32) the current cpu index
     @param index - (u32) index of the counter to increment
-    @param increment - (u32) quantitiy to add to the counter
+    @param increment - (u64) quantity to add to the counter
 */
 always_inline void
 vlib_increment_simple_counter (vlib_simple_counter_main_t * cm,
-			       u32 cpu_index, u32 index, u32 increment)
+			       u32 cpu_index, u32 index, u64 increment)
 {
-  u16 *my_minis;
-  u16 *mini;
-  u32 old, new;
+  counter_t *my_counters;
 
-  my_minis = cm->minis[cpu_index];
-  mini = vec_elt_at_index (my_minis, index);
-  old = mini[0];
-  new = old + increment;
-  mini[0] = new;
-
-  if (PREDICT_FALSE (mini[0] != new))
-    {
-      __sync_fetch_and_add (&cm->maxi[index], new);
-      my_minis[index] = 0;
-    }
+  my_counters = cm->counters[cpu_index];
+  my_counters[index] += increment;
 }
 
 /** Get the value of a simple counter
-    Scrapes the entire set of mini counters. Innacurate unless
+    Scrapes the entire set of per-thread counters. Inaccurate unless
     worker threads which might increment the counter are
     barrier-synchronized
 
@@ -104,30 +93,21 @@
     @param index - (u32) index of the counter to fetch
     @returns - (u64) current counter value
 */
-always_inline u64
+always_inline counter_t
 vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
 {
-  u16 *my_minis, *mini;
-  u64 v;
+  counter_t *my_counters;
+  counter_t v;
   int i;
 
-  ASSERT (index < vec_len (cm->maxi));
+  ASSERT (index < vlib_simple_counter_n_counters (cm));
 
   v = 0;
 
-  for (i = 0; i < vec_len (cm->minis); i++)
+  for (i = 0; i < vec_len (cm->counters); i++)
     {
-      my_minis = cm->minis[i];
-      mini = vec_elt_at_index (my_minis, index);
-      v += mini[0];
-    }
-
-  v += cm->maxi[index];
-
-  if (index < vec_len (cm->value_at_last_clear))
-    {
-      ASSERT (v >= cm->value_at_last_clear[index]);
-      v -= cm->value_at_last_clear[index];
+      my_counters = cm->counters[i];
+      v += my_counters[index];
     }
 
   return v;
@@ -142,29 +122,24 @@
 always_inline void
 vlib_zero_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
 {
-  u16 *my_minis;
+  counter_t *my_counters;
   int i;
 
-  ASSERT (index < vec_len (cm->maxi));
+  ASSERT (index < vlib_simple_counter_n_counters (cm));
 
-  for (i = 0; i < vec_len (cm->minis); i++)
+  for (i = 0; i < vec_len (cm->counters); i++)
     {
-      my_minis = cm->minis[i];
-      my_minis[index] = 0;
+      my_counters = cm->counters[i];
+      my_counters[index] = 0;
     }
-
-  cm->maxi[index] = 0;
-
-  if (index < vec_len (cm->value_at_last_clear))
-    cm->value_at_last_clear[index] = 0;
 }
 
 /** Combined counter to hold both packets and byte differences.
  */
 typedef struct
 {
-  u64 packets;			/**< packet counter */
-  u64 bytes;			/**< byte counter  */
+  counter_t packets;			/**< packet counter */
+  counter_t bytes;			/**< byte counter  */
 } vlib_counter_t;
 
 /** Add two combined counters, results in the first counter
@@ -201,24 +176,19 @@
   a->packets = a->bytes = 0;
 }
 
-/** Mini combined counter */
-typedef struct
-{
-  u16 packets;			/**< Packet count */
-  i16 bytes;			/**< Byte count */
-} vlib_mini_counter_t;
-
 /** A collection of combined counters */
 typedef struct
 {
-  vlib_mini_counter_t **minis;	/**< Per-thread u16 non-atomic counter pairs */
-  vlib_counter_t *maxi;		/**< Shared wide counter pairs */
-  vlib_counter_t *value_at_last_clear;	/**< Counter values as of last clear. */
+  vlib_counter_t **counters;	/**< Per-thread u64 non-atomic counter pairs */
   vlib_counter_t *value_at_last_serialize; /**< Counter values as of last serialize. */
   u32 last_incremental_serialize_index;	/**< Last counter index serialized incrementally. */
   char *name; /**< The counter collection's name. */
 } vlib_combined_counter_main_t;
 
+/** The number of counters (not the number of per-thread counters) */
+u32 vlib_combined_counter_n_counters (const vlib_combined_counter_main_t *
+				      cm);
+
 /** Clear a collection of simple counters
     @param cm - (vlib_simple_counter_main_t *) collection to clear
 */
@@ -233,62 +203,41 @@
     @param cm - (vlib_combined_counter_main_t *) comined counter main pointer
     @param cpu_index - (u32) the current cpu index
     @param index - (u32) index of the counter to increment
-    @param packet_increment - (u32) number of packets to add to the counter
-    @param byte_increment - (u32) number of bytes to add to the counter
+    @param n_packets - (u64) number of packets to add to the counter
+    @param n_bytes - (u64) number of bytes to add to the counter
 */
 
 always_inline void
 vlib_increment_combined_counter (vlib_combined_counter_main_t * cm,
 				 u32 cpu_index,
-				 u32 index,
-				 u32 packet_increment, u32 byte_increment)
+				 u32 index, u64 n_packets, u64 n_bytes)
 {
-  vlib_mini_counter_t *my_minis, *mini;
-  u32 old_packets, new_packets;
-  i32 old_bytes, new_bytes;
+  vlib_counter_t *my_counters;
 
-  /* Use this CPU's mini counter array */
-  my_minis = cm->minis[cpu_index];
+  /* Use this CPU's counter array */
+  my_counters = cm->counters[cpu_index];
 
-  mini = vec_elt_at_index (my_minis, index);
-  old_packets = mini->packets;
-  old_bytes = mini->bytes;
-
-  new_packets = old_packets + packet_increment;
-  new_bytes = old_bytes + byte_increment;
-
-  mini->packets = new_packets;
-  mini->bytes = new_bytes;
-
-  /* Bytes always overflow before packets.. */
-  if (PREDICT_FALSE (mini->bytes != new_bytes))
-    {
-      vlib_counter_t *maxi = vec_elt_at_index (cm->maxi, index);
-
-      __sync_fetch_and_add (&maxi->packets, new_packets);
-      __sync_fetch_and_add (&maxi->bytes, new_bytes);
-
-      mini->packets = 0;
-      mini->bytes = 0;
-    }
+  my_counters[index].packets += n_packets;
+  my_counters[index].bytes += n_bytes;
 }
 
-#define vlib_prefetch_combined_counter(_cm, _cpu_index, _index)  \
-{                                                                \
-    vlib_mini_counter_t *_cpu_minis;                             \
-                                                                 \
-    /*                                                           \
-     * This CPU's mini index is assumed to already be in cache   \
-     */                                                          \
-    _cpu_minis = (_cm)->minis[(_cpu_index)];                     \
-    CLIB_PREFETCH(_cpu_minis + (_index),                         \
-                  sizeof(*_cpu_minis),                           \
-                  STORE);                                        \
+/** Pre-fetch a per-thread combined counter for the given object index */
+always_inline void
+vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm,
+				u32 cpu_index, u32 index)
+{
+  vlib_counter_t *cpu_counters;
+
+  /*
+   * This CPU's index is assumed to already be in cache
+   */
+  cpu_counters = cm->counters[cpu_index];
+  CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE);
 }
 
 
 /** Get the value of a combined counter, never called in the speed path
-    Scrapes the entire set of mini counters. Innacurate unless
+    Scrapes the entire set of per-thread counters. Inaccurate unless
     worker threads which might increment the counter are
     barrier-synchronized
 
@@ -298,35 +247,27 @@
 */
 
 static inline void
-vlib_get_combined_counter (vlib_combined_counter_main_t * cm,
+vlib_get_combined_counter (const vlib_combined_counter_main_t * cm,
 			   u32 index, vlib_counter_t * result)
 {
-  vlib_mini_counter_t *my_minis, *mini;
-  vlib_counter_t *maxi;
+  vlib_counter_t *my_counters, *counter;
   int i;
 
   result->packets = 0;
   result->bytes = 0;
 
-  for (i = 0; i < vec_len (cm->minis); i++)
+  for (i = 0; i < vec_len (cm->counters); i++)
     {
-      my_minis = cm->minis[i];
+      my_counters = cm->counters[i];
 
-      mini = vec_elt_at_index (my_minis, index);
-      result->packets += mini->packets;
-      result->bytes += mini->bytes;
+      counter = vec_elt_at_index (my_counters, index);
+      result->packets += counter->packets;
+      result->bytes += counter->bytes;
     }
-
-  maxi = vec_elt_at_index (cm->maxi, index);
-  result->packets += maxi->packets;
-  result->bytes += maxi->bytes;
-
-  if (index < vec_len (cm->value_at_last_clear))
-    vlib_counter_sub (result, &cm->value_at_last_clear[index]);
 }
 
 /** Clear a combined counter
-    Clears the set of per-thread u16 counters, and the shared vlib_counter_t
+    Clears the set of per-thread counters.
 
     @param cm - (vlib_combined_counter_main_t *) combined counter main pointer
     @param index - (u32) index of the counter to clear
@@ -334,21 +275,17 @@
 always_inline void
 vlib_zero_combined_counter (vlib_combined_counter_main_t * cm, u32 index)
 {
-  vlib_mini_counter_t *mini, *my_minis;
+  vlib_counter_t *my_counters, *counter;
   int i;
 
-  for (i = 0; i < vec_len (cm->minis); i++)
+  for (i = 0; i < vec_len (cm->counters); i++)
     {
-      my_minis = cm->minis[i];
+      my_counters = cm->counters[i];
 
-      mini = vec_elt_at_index (my_minis, index);
-      mini->packets = 0;
-      mini->bytes = 0;
+      counter = vec_elt_at_index (my_counters, index);
+      counter->packets = 0;
+      counter->bytes = 0;
     }
-
-  vlib_counter_zero (&cm->maxi[index]);
-  if (index < vec_len (cm->value_at_last_clear))
-    vlib_counter_zero (&cm->value_at_last_clear[index]);
 }
 
 /** validate a simple counter
diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c
index 0dad61d..7352c2e 100644
--- a/src/vnet/ip/ip4_forward.c
+++ b/src/vnet/ip/ip4_forward.c
@@ -2375,6 +2375,17 @@
 	  adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
 	  adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];
 
+	  /*
+	   * pre-fetch the per-adjacency counters
+	   */
+	  if (do_counters)
+	    {
+	      vlib_prefetch_combined_counter (&adjacency_counters,
+					      cpu_index, adj_index0);
+	      vlib_prefetch_combined_counter (&adjacency_counters,
+					      cpu_index, adj_index1);
+	    }
+
 	  /* We should never rewrite a pkt using the MISS adjacency */
 	  ASSERT (adj_index0 && adj_index1);
 
@@ -2480,17 +2491,6 @@
 	     rewrite_header.max_l3_packet_bytes ? IP4_ERROR_MTU_EXCEEDED :
 	     error1);
 
-	  /*
-	   * pre-fetch the per-adjacency counters
-	   */
-	  if (do_counters)
-	    {
-	      vlib_prefetch_combined_counter (&adjacency_counters,
-					      cpu_index, adj_index0);
-	      vlib_prefetch_combined_counter (&adjacency_counters,
-					      cpu_index, adj_index1);
-	    }
-
 	  /* Don't adjust the buffer for ttl issue; icmp-error node wants
 	   * to see the IP headerr */
 	  if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
@@ -2624,8 +2624,9 @@
 	      p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
 	    }
 
-	  vlib_prefetch_combined_counter (&adjacency_counters,
-					  cpu_index, adj_index0);
+	  if (do_counters)
+	    vlib_prefetch_combined_counter (&adjacency_counters,
+					    cpu_index, adj_index0);
 
 	  /* Guess we are only writing on simple Ethernet header. */
 	  vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
@@ -2641,10 +2642,11 @@
 	  rw_len0 = adj0[0].rewrite_header.data_bytes;
 	  vnet_buffer (p0)->ip.save_rewrite_length = rw_len0;
 
-	  vlib_increment_combined_counter
-	    (&adjacency_counters,
-	     cpu_index,
-	     adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0);
+	  if (do_counters)
+	    vlib_increment_combined_counter
+	      (&adjacency_counters,
+	       cpu_index, adj_index0, 1,
+	       vlib_buffer_length_in_chain (vm, p0) + rw_len0);
 
 	  /* Check MTU of outgoing interface. */
 	  error0 = (vlib_buffer_length_in_chain (vm, p0)
diff --git a/src/vnet/map/map.c b/src/vnet/map/map.c
index 7006b1d..99305af 100644
--- a/src/vnet/map/map.c
+++ b/src/vnet/map/map.c
@@ -1304,7 +1304,7 @@
   {
     which = cm - mm->domain_counters;
 
-    for (i = 0; i < vec_len (cm->maxi); i++)
+    for (i = 0; i < vlib_combined_counter_n_counters (cm); i++)
       {
 	vlib_get_combined_counter (cm, i, &v);
 	total_pkts[which] += v.packets;
diff --git a/src/vnet/map/map_api.c b/src/vnet/map/map_api.c
index 7febeb3..d618e7a 100644
--- a/src/vnet/map/map_api.c
+++ b/src/vnet/map/map_api.c
@@ -211,7 +211,7 @@
   {
     which = cm - mm->domain_counters;
 
-    for (i = 0; i < vec_len (cm->maxi); i++)
+    for (i = 0; i < vlib_combined_counter_n_counters (cm); i++)
       {
 	vlib_get_combined_counter (cm, i, &v);
 	total_pkts[which] += v.packets;
diff --git a/src/vnet/rewrite.c b/src/vnet/rewrite.c
index c4a171c..47fb74d 100644
--- a/src/vnet/rewrite.c
+++ b/src/vnet/rewrite.c
@@ -79,7 +79,7 @@
       if (NULL != si)
 	s = format (s, "%U: ", format_vnet_sw_interface_name, vnm, si);
       else
-	s = format (s, "DELETED");
+	s = format (s, "DELETED:%d", rw->sw_if_index);
     }
 
   /* Format rewrite string. */
diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c
index d8301fa..8df4040 100644
--- a/src/vpp/api/api.c
+++ b/src/vpp/api/api.c
@@ -849,7 +849,7 @@
   {
     which = cm - im->combined_sw_if_counters;
 
-    for (i = 0; i < vec_len (cm->maxi); i++)
+    for (i = 0; i < vlib_combined_counter_n_counters (cm); i++)
       {
 	vlib_get_combined_counter (cm, i, &v);
 	total_pkts[which] += v.packets;
diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c
index c46d441..1927da0 100644
--- a/src/vpp/stats/stats.c
+++ b/src/vpp/stats/stats.c
@@ -134,7 +134,7 @@
   vlib_simple_counter_main_t *cm;
   u32 items_this_message = 0;
   u64 v, *vp = 0;
-  int i;
+  int i, n_counts;
 
   /*
    * Prevent interface registration from expanding / moving the vectors...
@@ -144,13 +144,13 @@
 
   vec_foreach (cm, im->sw_if_counters)
   {
-
-    for (i = 0; i < vec_len (cm->maxi); i++)
+    n_counts = vlib_simple_counter_n_counters (cm);
+    for (i = 0; i < n_counts; i++)
       {
 	if (mp == 0)
 	  {
 	    items_this_message = clib_min (SIMPLE_COUNTER_BATCH_SIZE,
-					   vec_len (cm->maxi) - i);
+					   n_counts - i);
 
 	    mp = vl_msg_api_alloc_as_if_client
 	      (sizeof (*mp) + items_this_message * sizeof (v));
@@ -189,19 +189,19 @@
   vlib_combined_counter_main_t *cm;
   u32 items_this_message = 0;
   vlib_counter_t v, *vp = 0;
-  int i;
+  int i, n_counts;
 
   vnet_interface_counter_lock (im);
 
   vec_foreach (cm, im->combined_sw_if_counters)
   {
-
-    for (i = 0; i < vec_len (cm->maxi); i++)
+    n_counts = vlib_combined_counter_n_counters (cm);
+    for (i = 0; i < n_counts; i++)
       {
 	if (mp == 0)
 	  {
 	    items_this_message = clib_min (COMBINED_COUNTER_BATCH_SIZE,
-					   vec_len (cm->maxi) - i);
+					   n_counts - i);
 
 	    mp = vl_msg_api_alloc_as_if_client
 	      (sizeof (*mp) + items_this_message * sizeof (v));