classify: use vector code even when data is not aligned

Type: feature
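
Replace the run-time U32X4_ALIGNED() check and its scalar u64
fallback with an unaligned vector type (u32x4u, declared with
__aligned__ (1)): the compiler then emits unaligned vector loads and
stores, so the vector path is taken regardless of where the packet
data lands. The u64 code is kept only for targets built without
CLIB_HAVE_VEC128.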

Change-Id: I8f5f4841965beb13ebc8c2a37ce0dc331c920109
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vnet/classify/vnet_classify.h b/src/vnet/classify/vnet_classify.h
index 620ef9a..35a5db3 100644
--- a/src/vnet/classify/vnet_classify.h
+++ b/src/vnet/classify/vnet_classify.h
@@ -28,8 +28,6 @@
 
 #define CLASSIFY_TRACE 0
 
-#define U32X4_ALIGNED(p) PREDICT_TRUE((((intptr_t)p) & 0xf) == 0)
-
 /*
  * Classify table option to process packets
  *  CLASSIFY_FLAG_USE_CURR_DATA:
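
For reference, the U32X4_ALIGNED() macro deleted above was the
classic low-bits alignment test. A minimal standalone sketch of the
same check (outside VPP, name illustrative):

#include <stdint.h>

/* True when p is 16-byte aligned, i.e. the low four address bits are
 * zero; this is what U32X4_ALIGNED() gated the SSE path on. */
static inline int
is_16_byte_aligned (const void *p)
{
  return ((uintptr_t) p & 0xf) == 0;
}
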
@@ -218,62 +216,57 @@
   ASSERT (t);
   mask = t->mask;
 #ifdef CLIB_HAVE_VEC128
-  if (U32X4_ALIGNED (h))
-    {				//SSE can't handle unaligned data
-      u32x4 *data = (u32x4 *) h;
-      xor_sum.as_u32x4 = data[0 + t->skip_n_vectors] & mask[0];
-      switch (t->match_n_vectors)
-	{
-	case 5:
-	  xor_sum.as_u32x4 ^= data[4 + t->skip_n_vectors] & mask[4];
-	  /* FALLTHROUGH */
-	case 4:
-	  xor_sum.as_u32x4 ^= data[3 + t->skip_n_vectors] & mask[3];
-	  /* FALLTHROUGH */
-	case 3:
-	  xor_sum.as_u32x4 ^= data[2 + t->skip_n_vectors] & mask[2];
-	  /* FALLTHROUGH */
-	case 2:
-	  xor_sum.as_u32x4 ^= data[1 + t->skip_n_vectors] & mask[1];
-	  /* FALLTHROUGH */
-	case 1:
-	  break;
-	default:
-	  abort ();
-	}
-    }
-  else
-#endif /* CLIB_HAVE_VEC128 */
+  u32x4u *data = (u32x4u *) h;
+  xor_sum.as_u32x4 = data[0 + t->skip_n_vectors] & mask[0];
+  switch (t->match_n_vectors)
     {
-      u32 skip_u64 = t->skip_n_vectors * 2;
-      u64 *data64 = (u64 *) h;
-      xor_sum.as_u64[0] = data64[0 + skip_u64] & ((u64 *) mask)[0];
-      xor_sum.as_u64[1] = data64[1 + skip_u64] & ((u64 *) mask)[1];
-      switch (t->match_n_vectors)
-	{
-	case 5:
-	  xor_sum.as_u64[0] ^= data64[8 + skip_u64] & ((u64 *) mask)[8];
-	  xor_sum.as_u64[1] ^= data64[9 + skip_u64] & ((u64 *) mask)[9];
-	  /* FALLTHROUGH */
-	case 4:
-	  xor_sum.as_u64[0] ^= data64[6 + skip_u64] & ((u64 *) mask)[6];
-	  xor_sum.as_u64[1] ^= data64[7 + skip_u64] & ((u64 *) mask)[7];
-	  /* FALLTHROUGH */
-	case 3:
-	  xor_sum.as_u64[0] ^= data64[4 + skip_u64] & ((u64 *) mask)[4];
-	  xor_sum.as_u64[1] ^= data64[5 + skip_u64] & ((u64 *) mask)[5];
-	  /* FALLTHROUGH */
-	case 2:
-	  xor_sum.as_u64[0] ^= data64[2 + skip_u64] & ((u64 *) mask)[2];
-	  xor_sum.as_u64[1] ^= data64[3 + skip_u64] & ((u64 *) mask)[3];
-	  /* FALLTHROUGH */
-	case 1:
-	  break;
-
-	default:
-	  abort ();
-	}
+    case 5:
+      xor_sum.as_u32x4 ^= data[4 + t->skip_n_vectors] & mask[4];
+      /* FALLTHROUGH */
+    case 4:
+      xor_sum.as_u32x4 ^= data[3 + t->skip_n_vectors] & mask[3];
+      /* FALLTHROUGH */
+    case 3:
+      xor_sum.as_u32x4 ^= data[2 + t->skip_n_vectors] & mask[2];
+      /* FALLTHROUGH */
+    case 2:
+      xor_sum.as_u32x4 ^= data[1 + t->skip_n_vectors] & mask[1];
+      /* FALLTHROUGH */
+    case 1:
+      break;
+    default:
+      abort ();
     }
+#else
+  u32 skip_u64 = t->skip_n_vectors * 2;
+  u64 *data64 = (u64 *) h;
+  xor_sum.as_u64[0] = data64[0 + skip_u64] & ((u64 *) mask)[0];
+  xor_sum.as_u64[1] = data64[1 + skip_u64] & ((u64 *) mask)[1];
+  switch (t->match_n_vectors)
+    {
+    case 5:
+      xor_sum.as_u64[0] ^= data64[8 + skip_u64] & ((u64 *) mask)[8];
+      xor_sum.as_u64[1] ^= data64[9 + skip_u64] & ((u64 *) mask)[9];
+      /* FALLTHROUGH */
+    case 4:
+      xor_sum.as_u64[0] ^= data64[6 + skip_u64] & ((u64 *) mask)[6];
+      xor_sum.as_u64[1] ^= data64[7 + skip_u64] & ((u64 *) mask)[7];
+      /* FALLTHROUGH */
+    case 3:
+      xor_sum.as_u64[0] ^= data64[4 + skip_u64] & ((u64 *) mask)[4];
+      xor_sum.as_u64[1] ^= data64[5 + skip_u64] & ((u64 *) mask)[5];
+      /* FALLTHROUGH */
+    case 2:
+      xor_sum.as_u64[0] ^= data64[2 + skip_u64] & ((u64 *) mask)[2];
+      xor_sum.as_u64[1] ^= data64[3 + skip_u64] & ((u64 *) mask)[3];
+      /* FALLTHROUGH */
+    case 1:
+      break;
+
+    default:
+      abort ();
+    }
+#endif /* CLIB_HAVE_VEC128 */
 
   return clib_xxhash (xor_sum.as_u64[0] ^ xor_sum.as_u64[1]);
 }
@@ -392,107 +385,98 @@
   v = vnet_classify_entry_at_index (t, v, value_index);
 
 #ifdef CLIB_HAVE_VEC128
-  if (U32X4_ALIGNED (h))
+  u32x4u *data = (u32x4u *) h;
+  for (i = 0; i < limit; i++)
     {
-      u32x4 *data = (u32x4 *) h;
-      for (i = 0; i < limit; i++)
+      key = v->key;
+      result.as_u32x4 = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+      switch (t->match_n_vectors)
 	{
-	  key = v->key;
-	  result.as_u32x4 = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
-	  switch (t->match_n_vectors)
-	    {
-	    case 5:
-	      result.as_u32x4 |=
-		(data[4 + t->skip_n_vectors] & mask[4]) ^ key[4];
-	      /* FALLTHROUGH */
-	    case 4:
-	      result.as_u32x4 |=
-		(data[3 + t->skip_n_vectors] & mask[3]) ^ key[3];
-	      /* FALLTHROUGH */
-	    case 3:
-	      result.as_u32x4 |=
-		(data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
-	      /* FALLTHROUGH */
-	    case 2:
-	      result.as_u32x4 |=
-		(data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
-	      /* FALLTHROUGH */
-	    case 1:
-	      break;
-	    default:
-	      abort ();
-	    }
-
-	  if (u32x4_zero_byte_mask (result.as_u32x4) == 0xffff)
-	    {
-	      if (PREDICT_TRUE (now))
-		{
-		  v->hits++;
-		  v->last_heard = now;
-		}
-	      return (v);
-	    }
-	  v = vnet_classify_entry_at_index (t, v, 1);
+	case 5:
+	  result.as_u32x4 |= (data[4 + t->skip_n_vectors] & mask[4]) ^ key[4];
+	  /* FALLTHROUGH */
+	case 4:
+	  result.as_u32x4 |= (data[3 + t->skip_n_vectors] & mask[3]) ^ key[3];
+	  /* FALLTHROUGH */
+	case 3:
+	  result.as_u32x4 |= (data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
+	  /* FALLTHROUGH */
+	case 2:
+	  result.as_u32x4 |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+	  /* FALLTHROUGH */
+	case 1:
+	  break;
+	default:
+	  abort ();
 	}
+
+      if (u32x4_zero_byte_mask (result.as_u32x4) == 0xffff)
+	{
+	  if (PREDICT_TRUE (now))
+	    {
+	      v->hits++;
+	      v->last_heard = now;
+	    }
+	  return (v);
+	}
+      v = vnet_classify_entry_at_index (t, v, 1);
     }
-  else
+#else
+  u32 skip_u64 = t->skip_n_vectors * 2;
+  u64 *data64 = (u64 *) h;
+  for (i = 0; i < limit; i++)
+    {
+      key = v->key;
+
+      result.as_u64[0] =
+	(data64[0 + skip_u64] & ((u64 *) mask)[0]) ^ ((u64 *) key)[0];
+      result.as_u64[1] =
+	(data64[1 + skip_u64] & ((u64 *) mask)[1]) ^ ((u64 *) key)[1];
+      switch (t->match_n_vectors)
+	{
+	case 5:
+	  result.as_u64[0] |=
+	    (data64[8 + skip_u64] & ((u64 *) mask)[8]) ^ ((u64 *) key)[8];
+	  result.as_u64[1] |=
+	    (data64[9 + skip_u64] & ((u64 *) mask)[9]) ^ ((u64 *) key)[9];
+	  /* FALLTHROUGH */
+	case 4:
+	  result.as_u64[0] |=
+	    (data64[6 + skip_u64] & ((u64 *) mask)[6]) ^ ((u64 *) key)[6];
+	  result.as_u64[1] |=
+	    (data64[7 + skip_u64] & ((u64 *) mask)[7]) ^ ((u64 *) key)[7];
+	  /* FALLTHROUGH */
+	case 3:
+	  result.as_u64[0] |=
+	    (data64[4 + skip_u64] & ((u64 *) mask)[4]) ^ ((u64 *) key)[4];
+	  result.as_u64[1] |=
+	    (data64[5 + skip_u64] & ((u64 *) mask)[5]) ^ ((u64 *) key)[5];
+	  /* FALLTHROUGH */
+	case 2:
+	  result.as_u64[0] |=
+	    (data64[2 + skip_u64] & ((u64 *) mask)[2]) ^ ((u64 *) key)[2];
+	  result.as_u64[1] |=
+	    (data64[3 + skip_u64] & ((u64 *) mask)[3]) ^ ((u64 *) key)[3];
+	  /* FALLTHROUGH */
+	case 1:
+	  break;
+	default:
+	  abort ();
+	}
+
+      if (result.as_u64[0] == 0 && result.as_u64[1] == 0)
+	{
+	  if (PREDICT_TRUE (now))
+	    {
+	      v->hits++;
+	      v->last_heard = now;
+	    }
+	  return (v);
+	}
+
+      v = vnet_classify_entry_at_index (t, v, 1);
+    }
 #endif /* CLIB_HAVE_VEC128 */
-    {
-      u32 skip_u64 = t->skip_n_vectors * 2;
-      u64 *data64 = (u64 *) h;
-      for (i = 0; i < limit; i++)
-	{
-	  key = v->key;
-
-	  result.as_u64[0] =
-	    (data64[0 + skip_u64] & ((u64 *) mask)[0]) ^ ((u64 *) key)[0];
-	  result.as_u64[1] =
-	    (data64[1 + skip_u64] & ((u64 *) mask)[1]) ^ ((u64 *) key)[1];
-	  switch (t->match_n_vectors)
-	    {
-	    case 5:
-	      result.as_u64[0] |=
-		(data64[8 + skip_u64] & ((u64 *) mask)[8]) ^ ((u64 *) key)[8];
-	      result.as_u64[1] |=
-		(data64[9 + skip_u64] & ((u64 *) mask)[9]) ^ ((u64 *) key)[9];
-	      /* FALLTHROUGH */
-	    case 4:
-	      result.as_u64[0] |=
-		(data64[6 + skip_u64] & ((u64 *) mask)[6]) ^ ((u64 *) key)[6];
-	      result.as_u64[1] |=
-		(data64[7 + skip_u64] & ((u64 *) mask)[7]) ^ ((u64 *) key)[7];
-	      /* FALLTHROUGH */
-	    case 3:
-	      result.as_u64[0] |=
-		(data64[4 + skip_u64] & ((u64 *) mask)[4]) ^ ((u64 *) key)[4];
-	      result.as_u64[1] |=
-		(data64[5 + skip_u64] & ((u64 *) mask)[5]) ^ ((u64 *) key)[5];
-	      /* FALLTHROUGH */
-	    case 2:
-	      result.as_u64[0] |=
-		(data64[2 + skip_u64] & ((u64 *) mask)[2]) ^ ((u64 *) key)[2];
-	      result.as_u64[1] |=
-		(data64[3 + skip_u64] & ((u64 *) mask)[3]) ^ ((u64 *) key)[3];
-	      /* FALLTHROUGH */
-	    case 1:
-	      break;
-	    default:
-	      abort ();
-	    }
-
-	  if (result.as_u64[0] == 0 && result.as_u64[1] == 0)
-	    {
-	      if (PREDICT_TRUE (now))
-		{
-		  v->hits++;
-		  v->last_heard = now;
-		}
-	      return (v);
-	    }
-
-	  v = vnet_classify_entry_at_index (t, v, 1);
-	}
-    }
   return 0;
 }
 
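Taken together, the classify hunks turn the old run-time dispatch into
a compile-time one. A minimal standalone sketch of the masked-XOR
hashing idea, assuming GCC/Clang vector extensions; the typedef names
mirror VPP's but nothing here is VPP code, and VPP additionally feeds
the result through clib_xxhash:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32x4 __attribute__ ((vector_size (16)));
typedef uint32_t u32x4u __attribute__ ((vector_size (16), __aligned__ (1)));

/* XOR together n masked 16-byte vectors starting at data, which may
 * sit at any byte offset; loads through u32x4u may be unaligned. */
static uint64_t
masked_xor_sum (const void *data, const u32x4 *mask, int n)
{
  const u32x4u *d = (const u32x4u *) data;
  u32x4 sum = d[0] & mask[0];
  for (int i = 1; i < n; i++)
    sum ^= d[i] & mask[i];
  union { u32x4 v; uint64_t u64[2]; } u = { .v = sum };
  return u.u64[0] ^ u.u64[1];
}

int
main (void)
{
  uint8_t buf[40];
  for (int i = 0; i < 40; i++)
    buf[i] = (uint8_t) i;
  u32x4 mask[2] = { { ~0u, 0, ~0u, 0 }, { 0, ~0u, 0, ~0u } };
  /* buf + 1 is deliberately misaligned; the vector path still works. */
  printf ("%016llx\n", (unsigned long long) masked_xor_sum (buf + 1, mask, 2));
  return 0;
}

On x86, for example, the aligned(1) type makes the compiler emit
unaligned moves (movups) instead of aligned ones (movaps), so a
misaligned pointer no longer faults.
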
diff --git a/src/vnet/l2/l2_rw.c b/src/vnet/l2/l2_rw.c
index 85a53aa..e585140 100644
--- a/src/vnet/l2/l2_rw.c
+++ b/src/vnet/l2/l2_rw.c
@@ -91,68 +91,26 @@
 static_always_inline void
 l2_rw_rewrite (l2_rw_entry_t * rwe, u8 * h)
 {
-  if (U32X4_ALIGNED (h))
+  u32x4u *d = ((u32x4u *) h) + rwe->skip_n_vectors;
+  switch (rwe->rewrite_n_vectors)
     {
-      u32x4 *d = ((u32x4 *) h) + rwe->skip_n_vectors;
-      switch (rwe->rewrite_n_vectors)
-	{
-	case 5:
-	  d[4] = (d[4] & ~rwe->mask[4]) | rwe->value[4];
-	  /* FALLTHROUGH */
-	case 4:
-	  d[3] = (d[3] & ~rwe->mask[3]) | rwe->value[3];
-	  /* FALLTHROUGH */
-	case 3:
-	  d[2] = (d[2] & ~rwe->mask[2]) | rwe->value[2];
-	  /* FALLTHROUGH */
-	case 2:
-	  d[1] = (d[1] & ~rwe->mask[1]) | rwe->value[1];
-	  /* FALLTHROUGH */
-	case 1:
-	  d[0] = (d[0] & ~rwe->mask[0]) | rwe->value[0];
-	  break;
-	default:
-	  abort ();
-	}
-    }
-  else
-    {
-      u64 *d = ((u64 *) h) + rwe->skip_n_vectors * 2;
-      switch (rwe->rewrite_n_vectors)
-	{
-	case 5:
-	  d[8] =
-	    (d[8] & ~(((u64 *) rwe->mask)[8])) | (((u64 *) rwe->value)[8]);
-	  d[9] =
-	    (d[9] & ~(((u64 *) rwe->mask)[9])) | (((u64 *) rwe->value)[9]);
-	  /* FALLTHROUGH */
-	case 4:
-	  d[6] =
-	    (d[6] & ~(((u64 *) rwe->mask)[6])) | (((u64 *) rwe->value)[6]);
-	  d[7] =
-	    (d[7] & ~(((u64 *) rwe->mask)[7])) | (((u64 *) rwe->value)[7]);
-	  /* FALLTHROUGH */
-	case 3:
-	  d[4] =
-	    (d[4] & ~(((u64 *) rwe->mask)[4])) | (((u64 *) rwe->value)[4]);
-	  d[5] =
-	    (d[5] & ~(((u64 *) rwe->mask)[5])) | (((u64 *) rwe->value)[5]);
-	  /* FALLTHROUGH */
-	case 2:
-	  d[2] =
-	    (d[2] & ~(((u64 *) rwe->mask)[2])) | (((u64 *) rwe->value)[2]);
-	  d[3] =
-	    (d[3] & ~(((u64 *) rwe->mask)[3])) | (((u64 *) rwe->value)[3]);
-	  /* FALLTHROUGH */
-	case 1:
-	  d[0] =
-	    (d[0] & ~(((u64 *) rwe->mask)[0])) | (((u64 *) rwe->value)[0]);
-	  d[1] =
-	    (d[1] & ~(((u64 *) rwe->mask)[1])) | (((u64 *) rwe->value)[1]);
-	  break;
-	default:
-	  abort ();
-	}
+    case 5:
+      d[4] = (d[4] & ~rwe->mask[4]) | rwe->value[4];
+      /* FALLTHROUGH */
+    case 4:
+      d[3] = (d[3] & ~rwe->mask[3]) | rwe->value[3];
+      /* FALLTHROUGH */
+    case 3:
+      d[2] = (d[2] & ~rwe->mask[2]) | rwe->value[2];
+      /* FALLTHROUGH */
+    case 2:
+      d[1] = (d[1] & ~rwe->mask[1]) | rwe->value[1];
+      /* FALLTHROUGH */
+    case 1:
+      d[0] = (d[0] & ~rwe->mask[0]) | rwe->value[0];
+      break;
+    default:
+      abort ();
     }
 }
 
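The same unaligned-vector idea drives the rewrite path: each 16-byte
chunk is loaded, the bits under the mask are replaced by the value,
and the chunk is stored back, all through the unaligned type. A
minimal standalone sketch under the same GCC/Clang vector-extension
assumptions (names illustrative, not VPP's):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32x4 __attribute__ ((vector_size (16)));
typedef uint32_t u32x4u __attribute__ ((vector_size (16), __aligned__ (1)));

/* Read-modify-write one 16-byte chunk at a possibly unaligned
 * address: keep the bits outside mask, substitute value inside it
 * (value is assumed to lie within mask, as in l2_rw). */
static void
rewrite_chunk (uint8_t *h, u32x4 mask, u32x4 value)
{
  u32x4u *d = (u32x4u *) h;
  d[0] = (d[0] & ~mask) | value;
}

int
main (void)
{
  uint8_t pkt[17] = { 0 };
  u32x4 mask = { 0xffffffff, 0, 0, 0 };	/* rewrite first 4 bytes only */
  u32x4 value = { 0xdeadbeef, 0, 0, 0 };
  rewrite_chunk (pkt + 1, mask, value);	/* misaligned on purpose */
  printf ("%02x %02x %02x %02x\n", pkt[1], pkt[2], pkt[3], pkt[4]);
  return 0;
}
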
diff --git a/src/vppinfra/vector.h b/src/vppinfra/vector.h
index 2b84cc2..906d8d8 100644
--- a/src/vppinfra/vector.h
+++ b/src/vppinfra/vector.h
@@ -76,6 +76,7 @@
 #endif
 
 #define _vector_size(n) __attribute__ ((vector_size (n)))
+#define _vector_size_unaligned(n) __attribute__ ((vector_size (n), __aligned__ (1)))
 
 #define foreach_vec64i  _(i,8,8)  _(i,16,4)  _(i,32,2)
 #define foreach_vec64u  _(u,8,8)  _(u,16,4)  _(u,32,2)
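
The one-line macro above is what makes the earlier hunks legal:
__aligned__ (1) lowers the type's alignment requirement to a single
byte while keeping its 16-byte size, which obliges the compiler to
generate unaligned load/store instructions. A small standalone probe
of the contract (GCC/Clang; typedef names copy the VPP pattern):

#include <stdint.h>
#include <stdio.h>

#define _vector_size(n) __attribute__ ((vector_size (n)))
#define _vector_size_unaligned(n) __attribute__ ((vector_size (n), __aligned__ (1)))

typedef uint32_t u32x4 _vector_size (16);
typedef uint32_t u32x4u _vector_size_unaligned (16);

int
main (void)
{
  /* Same size, different alignment contract: the u variant may be
   * loaded from any byte address. Prints align 16 vs. align 1. */
  printf ("u32x4:  size %zu align %zu\n", sizeof (u32x4), _Alignof (u32x4));
  printf ("u32x4u: size %zu align %zu\n", sizeof (u32x4u), _Alignof (u32x4u));
  return 0;
}
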
@@ -111,6 +112,7 @@
 /* Type Definitions */
 #define _(t,s,c) \
 typedef t##s t##s##x##c _vector_size (s/8*c);	\
+typedef t##s t##s##x##c##u _vector_size_unaligned (s/8*c);	\
 typedef union {	  \
   t##s##x##c as_##t##s##x##c;	\
   t##s as_##t##s[c];	  \