VPP-849: improve vnet classifier memory allocator performance

Port the linear-scan bucket fix from bihash_template.c.

Change-Id: Id8b2d1fe402401f098270ce6121c2f44f2f24c49
Signed-off-by: Dave Barach <dave@barachs.net>
diff --git a/src/vnet/classify/vnet_classify.h b/src/vnet/classify/vnet_classify.h
index 2c96663..ffe3dff 100644
--- a/src/vnet/classify/vnet_classify.h
+++ b/src/vnet/classify/vnet_classify.h
@@ -132,7 +132,8 @@
   union {
     struct {
       u32 offset;
-      u8 pad[3];
+      u8 linear_search;
+      u8 pad[2];
       u8 log2_pages;
     };
     u64 as_u64;
@@ -151,6 +152,7 @@
   u32 skip_n_vectors;
   u32 nbuckets;
   u32 log2_nbuckets;
+  u32 linear_buckets;
   int entries_per_page;
   u32 active_elements;
   u32 current_data_flag;
@@ -164,6 +166,7 @@
   
   /* Per-bucket working copies, one per thread */
   vnet_classify_entry_t ** working_copies;
+  int *working_copy_lengths;
   vnet_classify_bucket_t saved_bucket;
   
   /* Free entry freelists */
@@ -354,7 +357,7 @@
 static inline vnet_classify_entry_t *
 vnet_classify_find_entry_inline (vnet_classify_table_t * t,
                                  u8 * h, u64 hash, f64 now)
-  {
+{
   vnet_classify_entry_t * v;
   u32x4 *mask, *key;
   union {
@@ -364,6 +367,7 @@
   vnet_classify_bucket_t * b;
   u32 value_index;
   u32 bucket_index;
+  u32 limit;
   int i;
 
   bucket_index = hash & (t->nbuckets-1);
@@ -377,16 +381,23 @@
 
   v = vnet_classify_get_entry (t, b->offset);
   value_index = hash & ((1<<b->log2_pages)-1);
+  limit = t->entries_per_page;
+  if (PREDICT_FALSE (b->linear_search))
+    {
+      value_index = 0;
+      limit *= (1<<b->log2_pages);
+    }
+
   v = vnet_classify_entry_at_index (t, v, value_index);
 
 #ifdef CLASSIFY_USE_SSE
   if (U32X4_ALIGNED(h)) {
     u32x4 *data = (u32x4 *) h;
-    for (i = 0; i < t->entries_per_page; i++) {
+    for (i = 0; i < limit; i++) {
       key = v->key;
       result.as_u32x4 = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
       switch (t->match_n_vectors)
-      {
+        {
         case 5:
           result.as_u32x4 |= (data[4 + t->skip_n_vectors] & mask[4]) ^ key[4];
           /* FALLTHROUGH */
@@ -403,7 +414,7 @@
           break;
         default:
           abort();
-      }
+        }
 
       if (u32x4_zero_byte_mask (result.as_u32x4) == 0xffff) {
         if (PREDICT_TRUE(now)) {
@@ -416,51 +427,51 @@
     }
   } else
 #endif /* CLASSIFY_USE_SSE */
-  {
-    u32 skip_u64 = t->skip_n_vectors * 2;
-    u64 *data64 = (u64 *)h;
-    for (i = 0; i < t->entries_per_page; i++) {
-      key = v->key;
+    {
+      u32 skip_u64 = t->skip_n_vectors * 2;
+      u64 *data64 = (u64 *)h;
+      for (i = 0; i < limit; i++) {
+        key = v->key;
 
-      result.as_u64[0] = (data64[0 + skip_u64] & ((u64 *)mask)[0]) ^ ((u64 *)key)[0];
-      result.as_u64[1] = (data64[1 + skip_u64] & ((u64 *)mask)[1]) ^ ((u64 *)key)[1];
-      switch (t->match_n_vectors)
-      {
-        case 5:
-          result.as_u64[0] |= (data64[8 + skip_u64] & ((u64 *)mask)[8]) ^ ((u64 *)key)[8];
-          result.as_u64[1] |= (data64[9 + skip_u64] & ((u64 *)mask)[9]) ^ ((u64 *)key)[9];
-          /* FALLTHROUGH */
-        case 4:
-          result.as_u64[0] |= (data64[6 + skip_u64] & ((u64 *)mask)[6]) ^ ((u64 *)key)[6];
-          result.as_u64[1] |= (data64[7 + skip_u64] & ((u64 *)mask)[7]) ^ ((u64 *)key)[7];
-          /* FALLTHROUGH */
-        case 3:
-          result.as_u64[0] |= (data64[4 + skip_u64] & ((u64 *)mask)[4]) ^ ((u64 *)key)[4];
-          result.as_u64[1] |= (data64[5 + skip_u64] & ((u64 *)mask)[5]) ^ ((u64 *)key)[5];
-          /* FALLTHROUGH */
-        case 2:
-          result.as_u64[0] |= (data64[2 + skip_u64] & ((u64 *)mask)[2]) ^ ((u64 *)key)[2];
-          result.as_u64[1] |= (data64[3 + skip_u64] & ((u64 *)mask)[3]) ^ ((u64 *)key)[3];
-          /* FALLTHROUGH */
-        case 1:
-          break;
-        default:
-          abort();
-      }
+        result.as_u64[0] = (data64[0 + skip_u64] & ((u64 *)mask)[0]) ^ ((u64 *)key)[0];
+        result.as_u64[1] = (data64[1 + skip_u64] & ((u64 *)mask)[1]) ^ ((u64 *)key)[1];
+        switch (t->match_n_vectors)
+          {
+          case 5:
+            result.as_u64[0] |= (data64[8 + skip_u64] & ((u64 *)mask)[8]) ^ ((u64 *)key)[8];
+            result.as_u64[1] |= (data64[9 + skip_u64] & ((u64 *)mask)[9]) ^ ((u64 *)key)[9];
+            /* FALLTHROUGH */
+          case 4:
+            result.as_u64[0] |= (data64[6 + skip_u64] & ((u64 *)mask)[6]) ^ ((u64 *)key)[6];
+            result.as_u64[1] |= (data64[7 + skip_u64] & ((u64 *)mask)[7]) ^ ((u64 *)key)[7];
+            /* FALLTHROUGH */
+          case 3:
+            result.as_u64[0] |= (data64[4 + skip_u64] & ((u64 *)mask)[4]) ^ ((u64 *)key)[4];
+            result.as_u64[1] |= (data64[5 + skip_u64] & ((u64 *)mask)[5]) ^ ((u64 *)key)[5];
+            /* FALLTHROUGH */
+          case 2:
+            result.as_u64[0] |= (data64[2 + skip_u64] & ((u64 *)mask)[2]) ^ ((u64 *)key)[2];
+            result.as_u64[1] |= (data64[3 + skip_u64] & ((u64 *)mask)[3]) ^ ((u64 *)key)[3];
+            /* FALLTHROUGH */
+          case 1:
+            break;
+          default:
+            abort();
+          }
 
-      if (result.as_u64[0] == 0 && result.as_u64[1] == 0) {
-        if (PREDICT_TRUE(now)) {
-          v->hits++;
-          v->last_heard = now;
+        if (result.as_u64[0] == 0 && result.as_u64[1] == 0) {
+          if (PREDICT_TRUE(now)) {
+            v->hits++;
+            v->last_heard = now;
+          }
+          return (v);
         }
-        return (v);
-      }
 
-      v = vnet_classify_entry_at_index (t, v, 1);
+        v = vnet_classify_entry_at_index (t, v, 1);
+      }
     }
-  }
   return 0;
-  }
+}
 
 vnet_classify_table_t * 
 vnet_classify_new_table (vnet_classify_main_t *cm,