Mtrie optimisations

1 - make the default route non-special, i.e. like any other less specific route. Consequently, all buckets have a valid valid index of either a leaf or a ply. Checks for special indeices in the data-path can thus be removed.
2 - since all leaves are now 'real' i.e. they represent a real load-balance object, to tell if a ply slot is 'empty' requeirs chekcing that the prefix length of the leaf occupying the slot is slot than the minium value for that ply.
3 - when removing a leaf find the cover first, then recurse down the ply and replace the old leaf with the cover. This saves us a ply walk.

Change-Id: Idd523019e8bb1b6ef527b1f5279a5e24bcf18332
Signed-off-by: Neale Ranns <nranns@cisco.com>
diff --git a/src/vnet/ip/ip4_mtrie.c b/src/vnet/ip/ip4_mtrie.c
index 6e3d0e8..317d8f1 100644
--- a/src/vnet/ip/ip4_mtrie.c
+++ b/src/vnet/ip/ip4_mtrie.c
@@ -40,14 +40,64 @@
 #include <vnet/ip/ip.h>
 #include <vnet/fib/fib_entry.h>
 
-static void
-ply_init (ip4_fib_mtrie_ply_t * p, ip4_fib_mtrie_leaf_t init,
-	  uword prefix_len)
+always_inline u32
+ip4_fib_mtrie_leaf_is_non_empty (ip4_fib_mtrie_ply_t * p, u8 dst_byte)
 {
-  p->n_non_empty_leafs =
-    ip4_fib_mtrie_leaf_is_empty (init) ? 0 : ARRAY_LEN (p->leaves);
+  /*
+   * It's 'non-empty' if the length of the leaf stored is greater than the
+   * length of a leaf in the covering ply. i.e. the leaf is more specific
+   * than it's would be cover in the covering ply
+   */
+  if (p->dst_address_bits_of_leaves[dst_byte] > p->dst_address_bits_base)
+    return (1);
+  return (0);
+}
+
+always_inline ip4_fib_mtrie_leaf_t
+ip4_fib_mtrie_leaf_set_adj_index (u32 adj_index)
+{
+  ip4_fib_mtrie_leaf_t l;
+  l = 1 + 2 * adj_index;
+  ASSERT (ip4_fib_mtrie_leaf_get_adj_index (l) == adj_index);
+  return l;
+}
+
+always_inline u32
+ip4_fib_mtrie_leaf_is_next_ply (ip4_fib_mtrie_leaf_t n)
+{
+  return (n & 1) == 0;
+}
+
+always_inline u32
+ip4_fib_mtrie_leaf_get_next_ply_index (ip4_fib_mtrie_leaf_t n)
+{
+  ASSERT (ip4_fib_mtrie_leaf_is_next_ply (n));
+  return n >> 1;
+}
+
+always_inline ip4_fib_mtrie_leaf_t
+ip4_fib_mtrie_leaf_set_next_ply_index (u32 i)
+{
+  ip4_fib_mtrie_leaf_t l;
+  l = 0 + 2 * i;
+  ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (l) == i);
+  return l;
+}
+
+static void
+ply_init (ip4_fib_mtrie_ply_t * p,
+	  ip4_fib_mtrie_leaf_t init, u32 prefix_len, u32 ply_base_len)
+{
+  /*
+   * A leaf is 'empty' if it represents a leaf from the covering PLY
+   * i.e. if the prefix length of the leaf is less than or equal to
+   * the prefix length of the PLY
+   */
+  p->n_non_empty_leafs = (prefix_len > ply_base_len ?
+			  ARRAY_LEN (p->leaves) : 0);
   memset (p->dst_address_bits_of_leaves, prefix_len,
 	  sizeof (p->dst_address_bits_of_leaves));
+  p->dst_address_bits_base = ply_base_len;
 
   /* Initialize leaves. */
 #ifdef CLIB_HAVE_VEC128
@@ -92,15 +142,16 @@
 }
 
 static ip4_fib_mtrie_leaf_t
-ply_create (ip4_fib_mtrie_t * m, ip4_fib_mtrie_leaf_t init_leaf,
-	    uword prefix_len)
+ply_create (ip4_fib_mtrie_t * m,
+	    ip4_fib_mtrie_leaf_t init_leaf,
+	    u32 leaf_prefix_len, u32 ply_base_len)
 {
   ip4_fib_mtrie_ply_t *p;
 
   /* Get cache aligned ply. */
   pool_get_aligned (m->ply_pool, p, sizeof (p[0]));
 
-  ply_init (p, init_leaf, prefix_len);
+  ply_init (p, init_leaf, leaf_prefix_len, ply_base_len);
   return ip4_fib_mtrie_leaf_set_next_ply_index (p - m->ply_pool);
 }
 
@@ -128,7 +179,7 @@
     }
 
   if (is_root)
-    ply_init (p, IP4_FIB_MTRIE_LEAF_EMPTY, /* prefix_len */ 0);
+    ply_init (p, IP4_FIB_MTRIE_LEAF_EMPTY, /* prefix_len */ 0, 0);
   else
     pool_put (m->ply_pool, p);
 }
@@ -140,38 +191,13 @@
   ply_free (m, root_ply);
 }
 
-u32
-ip4_mtrie_lookup_address (ip4_fib_mtrie_t * m, ip4_address_t dst)
-{
-  ip4_fib_mtrie_ply_t *p = pool_elt_at_index (m->ply_pool, 0);
-  ip4_fib_mtrie_leaf_t l;
-
-  l = p->leaves[dst.as_u8[0]];
-  if (ip4_fib_mtrie_leaf_is_terminal (l))
-    return ip4_fib_mtrie_leaf_get_adj_index (l);
-
-  p = get_next_ply_for_leaf (m, l);
-  l = p->leaves[dst.as_u8[1]];
-  if (ip4_fib_mtrie_leaf_is_terminal (l))
-    return ip4_fib_mtrie_leaf_get_adj_index (l);
-
-  p = get_next_ply_for_leaf (m, l);
-  l = p->leaves[dst.as_u8[2]];
-  if (ip4_fib_mtrie_leaf_is_terminal (l))
-    return ip4_fib_mtrie_leaf_get_adj_index (l);
-
-  p = get_next_ply_for_leaf (m, l);
-  l = p->leaves[dst.as_u8[3]];
-
-  ASSERT (ip4_fib_mtrie_leaf_is_terminal (l));
-  return ip4_fib_mtrie_leaf_get_adj_index (l);
-}
-
 typedef struct
 {
   ip4_address_t dst_address;
   u32 dst_address_length;
   u32 adj_index;
+  u32 cover_address_length;
+  u32 cover_adj_index;
 } ip4_fib_mtrie_set_unset_leaf_args_t;
 
 static void
@@ -184,7 +210,6 @@
   uword i;
 
   ASSERT (ip4_fib_mtrie_leaf_is_terminal (new_leaf));
-  ASSERT (!ip4_fib_mtrie_leaf_is_empty (new_leaf));
 
   for (i = 0; i < ARRAY_LEN (ply->leaves); i++)
     {
@@ -205,7 +230,7 @@
 	  __sync_val_compare_and_swap (&ply->leaves[i], old_leaf, new_leaf);
 	  ASSERT (ply->leaves[i] == new_leaf);
 	  ply->dst_address_bits_of_leaves[i] = new_leaf_dst_address_bits;
-	  ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_empty (old_leaf);
+	  ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_non_empty (ply, i);
 	}
     }
 }
@@ -219,7 +244,7 @@
   i32 n_dst_bits_next_plies;
   u8 dst_byte;
 
-  ASSERT (a->dst_address_length > 0 && a->dst_address_length <= 32);
+  ASSERT (a->dst_address_length >= 0 && a->dst_address_length <= 32);
   ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8));
 
   n_dst_bits_next_plies =
@@ -232,7 +257,7 @@
     {
       uword i, n_dst_bits_this_ply, old_leaf_is_terminal;
 
-      n_dst_bits_this_ply = -n_dst_bits_next_plies;
+      n_dst_bits_this_ply = clib_min (8, -n_dst_bits_next_plies);
       ASSERT ((a->dst_address.as_u8[dst_address_byte_index] &
 	       pow2_mask (n_dst_bits_this_ply)) == 0);
 
@@ -252,13 +277,16 @@
 
 	      if (old_leaf_is_terminal)
 		{
+		  old_ply->n_non_empty_leafs -=
+		    ip4_fib_mtrie_leaf_is_non_empty (old_ply, i);
 		  old_ply->dst_address_bits_of_leaves[i] =
 		    a->dst_address_length;
 		  __sync_val_compare_and_swap (&old_ply->leaves[i], old_leaf,
 					       new_leaf);
 		  ASSERT (old_ply->leaves[i] == new_leaf);
+
 		  old_ply->n_non_empty_leafs +=
-		    ip4_fib_mtrie_leaf_is_empty (old_leaf);
+		    ip4_fib_mtrie_leaf_is_non_empty (old_ply, i);
 		  ASSERT (old_ply->n_non_empty_leafs <=
 			  ARRAY_LEN (old_ply->leaves));
 		}
@@ -283,14 +311,20 @@
   else
     {
       ip4_fib_mtrie_ply_t *old_ply, *new_ply;
+      u8 ply_base_len;
 
+      ply_base_len = 8 * (dst_address_byte_index + 1);
       old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
       old_leaf = old_ply->leaves[dst_byte];
       if (ip4_fib_mtrie_leaf_is_terminal (old_leaf))
 	{
-	  new_leaf =
-	    ply_create (m, old_leaf,
-			old_ply->dst_address_bits_of_leaves[dst_byte]);
+	  old_ply->n_non_empty_leafs -=
+	    ip4_fib_mtrie_leaf_is_non_empty (old_ply, dst_byte);
+
+	  new_leaf = ply_create (m, old_leaf,
+				 clib_max (old_ply->dst_address_bits_of_leaves
+					   [dst_byte], ply_base_len),
+				 ply_base_len);
 	  new_ply = get_next_ply_for_leaf (m, new_leaf);
 
 	  /* Refetch since ply_create may move pool. */
@@ -299,14 +333,11 @@
 	  __sync_val_compare_and_swap (&old_ply->leaves[dst_byte], old_leaf,
 				       new_leaf);
 	  ASSERT (old_ply->leaves[dst_byte] == new_leaf);
-	  old_ply->dst_address_bits_of_leaves[dst_byte] = 0;
-
-	  old_ply->n_non_empty_leafs -=
-	    ip4_fib_mtrie_leaf_is_non_empty (old_leaf);
-	  ASSERT (old_ply->n_non_empty_leafs >= 0);
+	  old_ply->dst_address_bits_of_leaves[dst_byte] = ply_base_len;
 
 	  /* Account for the ply we just created. */
 	  old_ply->n_non_empty_leafs += 1;
+	  ASSERT (old_ply->n_non_empty_leafs >= 0);
 	}
       else
 	new_ply = get_next_ply_for_leaf (m, old_leaf);
@@ -325,7 +356,7 @@
   i32 i, n_dst_bits_this_ply, old_leaf_is_terminal;
   u8 dst_byte;
 
-  ASSERT (a->dst_address_length > 0 && a->dst_address_length <= 32);
+  ASSERT (a->dst_address_length >= 0 && a->dst_address_length <= 32);
   ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8));
 
   n_dst_bits_next_plies =
@@ -351,12 +382,17 @@
 	      && unset_leaf (m, a, get_next_ply_for_leaf (m, old_leaf),
 			     dst_address_byte_index + 1)))
 	{
-	  old_ply->leaves[i] = IP4_FIB_MTRIE_LEAF_EMPTY;
-	  old_ply->dst_address_bits_of_leaves[i] = 0;
+	  old_ply->n_non_empty_leafs -=
+	    ip4_fib_mtrie_leaf_is_non_empty (old_ply, i);
 
-	  /* No matter what we just deleted a non-empty leaf. */
-	  ASSERT (!ip4_fib_mtrie_leaf_is_empty (old_leaf));
-	  old_ply->n_non_empty_leafs -= 1;
+	  old_ply->leaves[i] =
+	    ip4_fib_mtrie_leaf_set_adj_index (a->cover_adj_index);
+	  old_ply->dst_address_bits_of_leaves[i] =
+	    clib_max (old_ply->dst_address_bits_base,
+		      a->cover_address_length);
+
+	  old_ply->n_non_empty_leafs +=
+	    ip4_fib_mtrie_leaf_is_non_empty (old_ply, i);
 
 	  ASSERT (old_ply->n_non_empty_leafs >= 0);
 	  if (old_ply->n_non_empty_leafs == 0 && dst_address_byte_index > 0)
@@ -365,6 +401,17 @@
 	      /* Old ply was deleted. */
 	      return 1;
 	    }
+#if CLIB_DEBUG > 0
+	  else if (dst_address_byte_index)
+	    {
+	      int ii, count = 0;
+	      for (ii = 0; ii < ARRAY_LEN (old_ply->leaves); ii++)
+		{
+		  count += ip4_fib_mtrie_leaf_is_non_empty (old_ply, ii);
+		}
+	      ASSERT (count);
+	    }
+#endif
 	}
     }
 
@@ -377,9 +424,7 @@
 {
   ip4_fib_mtrie_leaf_t root;
   memset (m, 0, sizeof (m[0]));
-  m->default_leaf = IP4_FIB_MTRIE_LEAF_EMPTY;
-  root = ply_create (m, IP4_FIB_MTRIE_LEAF_EMPTY,	/* dst_address_bits_of_leaves */
-		     0);
+  root = ply_create (m, IP4_FIB_MTRIE_LEAF_EMPTY, 0, 0);
   ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (root) == 0);
 }
 
@@ -406,25 +451,21 @@
 
   if (!is_del)
     {
-      if (dst_address_length == 0)
-	m->default_leaf = ip4_fib_mtrie_leaf_set_adj_index (adj_index);
-      else
-	set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0);
+      set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0);
     }
   else
     {
-      if (dst_address_length == 0)
-	m->default_leaf = IP4_FIB_MTRIE_LEAF_EMPTY;
+      ip4_main_t *im = &ip4_main;
 
-      else
+      if (dst_address_length)
 	{
-	  ip4_main_t *im = &ip4_main;
-	  uword i;
+	  word i;
 
-	  unset_leaf (m, &a, root_ply, 0);
-
-	  /* Find next less specific route and insert into mtrie. */
-	  for (i = dst_address_length - 1; i >= 1; i--)
+	  /* If the ply was not deleted, then we need to fill the
+	   * bucket just reset will the leaf from the less specfic
+	   * cover.
+	   * Find next less specific route and insert into mtrie. */
+	  for (i = dst_address_length - 1; i >= 0; i--)
 	    {
 	      uword *p;
 	      index_t lbi;
@@ -441,16 +482,21 @@
 		  if (INDEX_INVALID == lbi)
 		    continue;
 
-		  a.dst_address = key;
-		  a.adj_index = lbi;
-		  a.dst_address_length = i;
+		  a.cover_adj_index = lbi;
+		  a.cover_address_length = i;
 
-		  set_leaf (m, &a, /* ply_index */ 0,
-			    /* dst_address_byte_index */ 0);
 		  break;
 		}
 	    }
 	}
+      else
+	{
+	  a.cover_adj_index = 0;
+	  a.cover_address_length = 0;
+	}
+
+      /* the top level ply is never removed, so we can ignore the return code */
+      unset_leaf (m, &a, root_ply, 0);
     }
 }
 
@@ -483,10 +529,8 @@
 {
   ip4_fib_mtrie_leaf_t l = va_arg (*va, ip4_fib_mtrie_leaf_t);
 
-  if (ip4_fib_mtrie_leaf_is_empty (l))
-    s = format (s, "miss");
-  else if (ip4_fib_mtrie_leaf_is_terminal (l))
-    s = format (s, "adj %d", ip4_fib_mtrie_leaf_get_adj_index (l));
+  if (ip4_fib_mtrie_leaf_is_terminal (l))
+    s = format (s, "lb-index %d", ip4_fib_mtrie_leaf_get_adj_index (l));
   else
     s = format (s, "next ply %d", ip4_fib_mtrie_leaf_get_next_ply_index (l));
   return s;
@@ -511,7 +555,7 @@
     {
       ip4_fib_mtrie_leaf_t l = p->leaves[i];
 
-      if (!ip4_fib_mtrie_leaf_is_empty (l))
+      if (ip4_fib_mtrie_leaf_is_non_empty (p, i))
 	{
 	  u32 a, ia_length;
 	  ip4_address_t ia;