bihash key compare improvements

It looks like the CPU doesn't like overlapping loads.
In some cases the new code shows a 3-4 clock improvement.

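For context, on the AVX2 path the old code covered the 40-byte (5 x u64)
key with two overlapping 32-byte loads (a[0..3] and a[1..4]); the new code
XORs the fifth u64 as a scalar and issues a single 32-byte load. A minimal
standalone sketch of the new pattern, written with raw AVX2 intrinsics
rather than the vppinfra u64x4 wrappers (key_equal_40 is a hypothetical
name, illustrative only):

  #include <immintrin.h>
  #include <stdint.h>

  /* sketch: 40-byte key compare without overlapping loads */
  static inline int
  key_equal_40 (const uint64_t *a, const uint64_t *b)
  {
    /* scalar xor of the trailing u64 goes into lane 0 */
    __m256i v = _mm256_set_epi64x (0, 0, 0, (long long) (a[4] ^ b[4]));
    /* one 32-byte unaligned load covers a[0..3] / b[0..3] */
    v = _mm256_or_si256 (v, _mm256_xor_si256 (
          _mm256_loadu_si256 ((const __m256i *) a),
          _mm256_loadu_si256 ((const __m256i *) b)));
    /* keys are equal iff every lane of v is zero */
    return _mm256_testz_si256 (v, v);
  }
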
Change-Id: Ia1b49976ad95140c573f892fdc0a32eebbfa06c8
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vppinfra/bihash_40_8.h b/src/vppinfra/bihash_40_8.h
index 9ceecfc..90adb8f 100644
--- a/src/vppinfra/bihash_40_8.h
+++ b/src/vppinfra/bihash_40_8.h
@@ -74,15 +74,13 @@
   v = u64x8_load_unaligned (a) ^ u64x8_load_unaligned (b);
   return (u64x8_is_zero_mask (v) & 0x1f) == 0;
 #elif defined (CLIB_HAVE_VEC256)
-  u64x4 v;
-  v = u64x4_load_unaligned (a) ^ u64x4_load_unaligned (b);
-  v |= u64x4_load_unaligned (a + 1) ^ u64x4_load_unaligned (b + 1);
+  u64x4 v = { a[4] ^ b[4], 0, 0, 0 };
+  v |= u64x4_load_unaligned (a) ^ u64x4_load_unaligned (b);
   return u64x4_is_all_zero (v);
 #elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE)
-  u64x2 v;
-  v = u64x2_load_unaligned (a) ^ u64x2_load_unaligned (b);
+  u64x2 v = { a[4] ^ b[4], 0 };
+  v |= u64x2_load_unaligned (a) ^ u64x2_load_unaligned (b);
   v |= u64x2_load_unaligned (a + 2) ^ u64x2_load_unaligned (b + 2);
-  v |= u64x2_load_unaligned (a + 3) ^ u64x2_load_unaligned (b + 3);
   return u64x2_is_all_zero (v);
 #else
   return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | (a[3] ^ b[3])