vppinfra: use count_trailing_zeros in sparse_vec_index

It is much cheaper to use ctzll than to do shift, subtract and mask
in the likely case when we are looking for the 1st set bit in the uword.

Change-Id: I31954081571978878c7098bafad0c85a91755fa2
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c
index 2eaf17e..08bf0c9 100644
--- a/src/plugins/dpdk/device/cli.c
+++ b/src/plugins/dpdk/device/cli.c
@@ -1323,20 +1323,20 @@
 	xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
 	xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
 	xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr =
-	  __builtin_ctzll (mask);
+	  count_trailing_zeros (mask);
 	break;
       case 1:
 	xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
 	xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
 	xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr =
-	  __builtin_ctzll (mask);
+	  count_trailing_zeros (mask);
 	break;
       case 2:
       default:
 	xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
 	xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
 	xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr =
-	  __builtin_ctzll (mask);
+	  count_trailing_zeros (mask);
       }
 
 done:
diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c
index c9b8565..123ecba 100644
--- a/src/plugins/dpdk/hqos/hqos.c
+++ b/src/plugins/dpdk/hqos/hqos.c
@@ -168,8 +168,8 @@
 dpdk_hqos_validate_mask (u64 mask, u32 n)
 {
   int count = __builtin_popcountll (mask);
-  int pos_lead = sizeof (u64) * 8 - __builtin_clzll (mask);
-  int pos_trail = __builtin_ctzll (mask);
+  int pos_lead = sizeof (u64) * 8 - count_leading_zeros (mask);
+  int pos_trail = count_trailing_zeros (mask);
   int count_expected = __builtin_popcount (n - 1);
 
   /* Handle the exceptions */
@@ -363,15 +363,15 @@
       xd->hqos_wt[tid].hqos_field0_slabpos = hqos->pktfield0_slabpos;
       xd->hqos_wt[tid].hqos_field0_slabmask = hqos->pktfield0_slabmask;
       xd->hqos_wt[tid].hqos_field0_slabshr =
-	__builtin_ctzll (hqos->pktfield0_slabmask);
+	count_trailing_zeros (hqos->pktfield0_slabmask);
       xd->hqos_wt[tid].hqos_field1_slabpos = hqos->pktfield1_slabpos;
       xd->hqos_wt[tid].hqos_field1_slabmask = hqos->pktfield1_slabmask;
       xd->hqos_wt[tid].hqos_field1_slabshr =
-	__builtin_ctzll (hqos->pktfield1_slabmask);
+	count_trailing_zeros (hqos->pktfield1_slabmask);
       xd->hqos_wt[tid].hqos_field2_slabpos = hqos->pktfield2_slabpos;
       xd->hqos_wt[tid].hqos_field2_slabmask = hqos->pktfield2_slabmask;
       xd->hqos_wt[tid].hqos_field2_slabshr =
-	__builtin_ctzll (hqos->pktfield2_slabmask);
+	count_trailing_zeros (hqos->pktfield2_slabmask);
       memcpy (xd->hqos_wt[tid].hqos_tc_table, hqos->tc_table,
 	      sizeof (hqos->tc_table));
     }
diff --git a/src/svm/svm.c b/src/svm/svm.c
index 16a58fa..e3a734c 100644
--- a/src/svm/svm.c
+++ b/src/svm/svm.c
@@ -83,7 +83,7 @@
   unformat_free (&input);
   close (fd);
 
-  count_leading_zeros (bits, end);
+  bits = count_leading_zeros (end);
   bits = 64 - bits;
   if (bits >= 36 && bits <= 48)
     return ((1ul << bits) / 4) - (2 * SVM_GLOBAL_REGION_SIZE);
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index c1fca44..7e10a60 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -248,8 +248,8 @@
   r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x88);
 
   r = _mm_shuffle_epi8 (r, _mm_set_epi64x (0, 0x0e060c040a020800));
-  i = __builtin_ctzll (_mm_movemask_epi8 (r) |
-		       (1 << VHOST_MEMORY_MAX_NREGIONS));
+  i = count_trailing_zeros (_mm_movemask_epi8 (r) |
+			    (1 << VHOST_MEMORY_MAX_NREGIONS));
 
   if (i < vui->nregions)
     {
@@ -275,7 +275,7 @@
 
   if (u32)
     {
-      i = __builtin_ctzll (u32);
+      i = count_trailing_zeros (u32);
       goto vhost_map_guest_mem_done;
     }
 
@@ -290,7 +290,7 @@
 
   if (u32)
     {
-      i = __builtin_ctzll (u32);
+      i = count_trailing_zeros (u32);
       goto vhost_map_guest_mem_done;
     }
 
@@ -305,7 +305,7 @@
 
   if (u32)
     {
-      i = __builtin_ctzll (u32);
+      i = count_trailing_zeros (u32);
       goto vhost_map_guest_mem_done;
     }
 
@@ -318,7 +318,7 @@
   u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 6);
   u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 7);
 
-  i = __builtin_ctzll (u32 | (1 << VHOST_MEMORY_MAX_NREGIONS));
+  i = count_trailing_zeros (u32 | (1 << VHOST_MEMORY_MAX_NREGIONS));
 
 vhost_map_guest_mem_done:
   if (i < vui->nregions)
diff --git a/src/vnet/l2/feat_bitmap.h b/src/vnet/l2/feat_bitmap.h
index 5940ff7..a1b295a 100644
--- a/src/vnet/l2/feat_bitmap.h
+++ b/src/vnet/l2/feat_bitmap.h
@@ -80,7 +80,7 @@
 {
   u32 first_bit;
 
-  count_leading_zeros (first_bit, bitmap);
+  first_bit = count_leading_zeros (bitmap);
   first_bit = uword_bits - 1 - first_bit;
   return next_nodes[first_bit];
 }
diff --git a/src/vppinfra/bitmap.h b/src/vppinfra/bitmap.h
index 9e1ae49..dbf9eeb 100644
--- a/src/vppinfra/bitmap.h
+++ b/src/vppinfra/bitmap.h
@@ -409,7 +409,7 @@
       if (x != 0)
 	{
 	  uword first_bit;
-	  count_leading_zeros (first_bit, x);
+	  first_bit = count_leading_zeros (x);
 	  return (i) * BITS (ai[0]) - first_bit - 1;
 	}
     }
diff --git a/src/vppinfra/clib.h b/src/vppinfra/clib.h
index 0d059a0..42748b0 100644
--- a/src/vppinfra/clib.h
+++ b/src/vppinfra/clib.h
@@ -125,71 +125,20 @@
   decl
 
 /* Use __builtin_clz if available. */
-#ifdef __GNUC__
-#include <features.h>
-#if __GNUC_PREREQ(3, 4)
 #if uword_bits == 64
-#define count_leading_zeros(count,x) count = __builtin_clzll (x)
-#define count_trailing_zeros(count,x) count = __builtin_ctzll (x)
+#define count_leading_zeros(x) __builtin_clzll (x)
+#define count_trailing_zeros(x) __builtin_ctzll (x)
 #else
-#define count_leading_zeros(count,x) count = __builtin_clzl (x)
-#define count_trailing_zeros(count,x) count = __builtin_ctzl (x)
+#define count_leading_zeros(x) __builtin_clzl (x)
+#define count_trailing_zeros(x) __builtin_ctzl (x)
 #endif
-#endif
-#endif
-
-#ifndef count_leading_zeros
-
-/* Misc. integer arithmetic functions. */
-#if defined (i386)
-#define count_leading_zeros(count, x)		\
-  do {						\
-    word _clz;					\
-    __asm__ ("bsrl %1,%0"			\
-	     : "=r" (_clz) : "rm" ((word) (x)));\
-    (count) = _clz ^ 31;			\
-  } while (0)
-
-#define count_trailing_zeros(count, x)			\
-  __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((word)(x)))
-#endif /* i386 */
-
-#if defined (__alpha__) && defined (HAVE_CIX)
-#define count_leading_zeros(count, x)		\
-  __asm__ ("ctlz %1,%0"				\
-	   : "=r" ((word) (count))		\
-	   : "r" ((word) (x)))
-#define count_trailing_zeros(count, x)		\
-  __asm__ ("cttz %1,%0"				\
-	   : "=r" ((word) (count))		\
-	   : "r" ((word) (x)))
-#endif /* alpha && HAVE_CIX */
-
-#if __mips >= 4
-
-/* Select between 32/64 opcodes. */
-#if uword_bits == 32
-#define count_leading_zeros(_count, _x)		\
-  __asm__ ("clz %[count],%[x]"			\
-	   : [count] "=r" ((word) (_count))	\
-	   : [x] "r" ((word) (_x)))
-#else
-#define count_leading_zeros(_count, _x)		\
-  __asm__ ("dclz %[count],%[x]"			\
-	   : [count] "=r" ((word) (_count))	\
-	   : [x] "r" ((word) (_x)))
-#endif
-
-#endif /* __mips >= 4 */
-
-#endif /* count_leading_zeros */
 
 #if defined (count_leading_zeros)
 always_inline uword
 min_log2 (uword x)
 {
   uword n;
-  count_leading_zeros (n, x);
+  n = count_leading_zeros (x);
   return BITS (uword) - n - 1;
 }
 #else
@@ -305,7 +254,7 @@
 {
   uword result;
 #ifdef count_trailing_zeros
-  count_trailing_zeros (result, x);
+  result = count_trailing_zeros (x);
 #else
   result = min_log2 (first_set (x));
 #endif
diff --git a/src/vppinfra/sparse_vec.h b/src/vppinfra/sparse_vec.h
index 0da154d..cfa5778 100644
--- a/src/vppinfra/sparse_vec.h
+++ b/src/vppinfra/sparse_vec.h
@@ -108,15 +108,20 @@
 
   h = sparse_vec_header (v);
   i = sparse_index / BITS (h->is_member_bitmap[0]);
-  b = (uword) 1 << (uword) (sparse_index % BITS (h->is_member_bitmap[0]));
+  b = sparse_index % BITS (h->is_member_bitmap[0]);
 
   ASSERT (i < vec_len (h->is_member_bitmap));
   ASSERT (i < vec_len (h->member_counts));
 
   w = h->is_member_bitmap[i];
-  d = h->member_counts[i] + count_set_bits (w & (b - 1));
 
-  is_member = (w & b) != 0;
+  if (PREDICT_TRUE (maybe_range == 0 && insert == 0 &&
+		    count_trailing_zeros (w) == b))
+    return h->member_counts[i] + 1;
+
+  d = h->member_counts[i] + count_set_bits (w & ((1ULL << b) - 1));
+  is_member = (w & (1ULL << b)) != 0;
+
   if (maybe_range)
     {
       u8 r = h->range_flags[d];
@@ -134,7 +139,7 @@
       if (!is_member)
 	{
 	  uword j;
-	  w |= b;
+	  w |= 1ULL << b;
 	  h->is_member_bitmap[i] = w;
 	  for (j = i + 1; j < vec_len (h->member_counts); j++)
 	    h->member_counts[j] += 1;
@@ -170,8 +175,8 @@
   i0 = si0 / BITS (h->is_member_bitmap[0]);
   i1 = si1 / BITS (h->is_member_bitmap[0]);
 
-  b0 = (uword) 1 << (uword) (si0 % BITS (h->is_member_bitmap[0]));
-  b1 = (uword) 1 << (uword) (si1 % BITS (h->is_member_bitmap[0]));
+  b0 = si0 % BITS (h->is_member_bitmap[0]);
+  b1 = si1 % BITS (h->is_member_bitmap[0]);
 
   ASSERT (i0 < vec_len (h->is_member_bitmap));
   ASSERT (i1 < vec_len (h->is_member_bitmap));
@@ -182,8 +187,16 @@
   w0 = h->is_member_bitmap[i0];
   w1 = h->is_member_bitmap[i1];
 
-  v0 = w0 & (b0 - 1);
-  v1 = w1 & (b1 - 1);
+  if (PREDICT_TRUE ((count_trailing_zeros (w0) == b0) +
+		    (count_trailing_zeros (w1) == b1) == 2))
+    {
+      *i0_return = h->member_counts[i0] + 1;
+      *i1_return = h->member_counts[i1] + 1;
+      return;
+    }
+
+  v0 = w0 & ((1ULL << b0) - 1);
+  v1 = w1 & ((1ULL << b1) - 1);
 
   /* Speculate that masks will have zero or one bits set. */
   d0 = h->member_counts[i0] + (v0 != 0);
@@ -196,8 +209,8 @@
       d1 += count_set_bits (v1) - (v1 != 0);
     }
 
-  is_member0 = (w0 & b0) != 0;
-  is_member1 = (w1 & b1) != 0;
+  is_member0 = (w0 & (1ULL << b0)) != 0;
+  is_member1 = (w1 & (1ULL << b1)) != 0;
 
   d0 = is_member0 ? d0 : 0;
   d1 = is_member1 ? d1 : 0;