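classify: use vector code even when data is not aligned

Add unaligned variants of the vector types (suffix "u", e.g. u32x4u,
declared with __aligned__ (1)) to vppinfra/vector.h and use them in the
classifier hash and entry-match code and in l2_rw_rewrite. This removes
the U32X4_ALIGNED runtime check: the scalar u64 path in the classifier
is now compiled only when CLIB_HAVE_VEC128 is not defined, and
l2_rw_rewrite drops its u64 path entirely.
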
Type: feature
Change-Id: I8f5f4841965beb13ebc8c2a37ce0dc331c920109
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vnet/classify/vnet_classify.h b/src/vnet/classify/vnet_classify.h
index 620ef9a..35a5db3 100644
--- a/src/vnet/classify/vnet_classify.h
+++ b/src/vnet/classify/vnet_classify.h
@@ -28,8 +28,6 @@
#define CLASSIFY_TRACE 0
-#define U32X4_ALIGNED(p) PREDICT_TRUE((((intptr_t)p) & 0xf) == 0)
-
/*
* Classify table option to process packets
* CLASSIFY_FLAG_USE_CURR_DATA:
@@ -218,62 +216,57 @@
ASSERT (t);
mask = t->mask;
#ifdef CLIB_HAVE_VEC128
- if (U32X4_ALIGNED (h))
- { //SSE can't handle unaligned data
- u32x4 *data = (u32x4 *) h;
- xor_sum.as_u32x4 = data[0 + t->skip_n_vectors] & mask[0];
- switch (t->match_n_vectors)
- {
- case 5:
- xor_sum.as_u32x4 ^= data[4 + t->skip_n_vectors] & mask[4];
- /* FALLTHROUGH */
- case 4:
- xor_sum.as_u32x4 ^= data[3 + t->skip_n_vectors] & mask[3];
- /* FALLTHROUGH */
- case 3:
- xor_sum.as_u32x4 ^= data[2 + t->skip_n_vectors] & mask[2];
- /* FALLTHROUGH */
- case 2:
- xor_sum.as_u32x4 ^= data[1 + t->skip_n_vectors] & mask[1];
- /* FALLTHROUGH */
- case 1:
- break;
- default:
- abort ();
- }
- }
- else
-#endif /* CLIB_HAVE_VEC128 */
+ u32x4u *data = (u32x4u *) h;
+ xor_sum.as_u32x4 = data[0 + t->skip_n_vectors] & mask[0];
+ switch (t->match_n_vectors)
{
- u32 skip_u64 = t->skip_n_vectors * 2;
- u64 *data64 = (u64 *) h;
- xor_sum.as_u64[0] = data64[0 + skip_u64] & ((u64 *) mask)[0];
- xor_sum.as_u64[1] = data64[1 + skip_u64] & ((u64 *) mask)[1];
- switch (t->match_n_vectors)
- {
- case 5:
- xor_sum.as_u64[0] ^= data64[8 + skip_u64] & ((u64 *) mask)[8];
- xor_sum.as_u64[1] ^= data64[9 + skip_u64] & ((u64 *) mask)[9];
- /* FALLTHROUGH */
- case 4:
- xor_sum.as_u64[0] ^= data64[6 + skip_u64] & ((u64 *) mask)[6];
- xor_sum.as_u64[1] ^= data64[7 + skip_u64] & ((u64 *) mask)[7];
- /* FALLTHROUGH */
- case 3:
- xor_sum.as_u64[0] ^= data64[4 + skip_u64] & ((u64 *) mask)[4];
- xor_sum.as_u64[1] ^= data64[5 + skip_u64] & ((u64 *) mask)[5];
- /* FALLTHROUGH */
- case 2:
- xor_sum.as_u64[0] ^= data64[2 + skip_u64] & ((u64 *) mask)[2];
- xor_sum.as_u64[1] ^= data64[3 + skip_u64] & ((u64 *) mask)[3];
- /* FALLTHROUGH */
- case 1:
- break;
-
- default:
- abort ();
- }
+ case 5:
+ xor_sum.as_u32x4 ^= data[4 + t->skip_n_vectors] & mask[4];
+ /* FALLTHROUGH */
+ case 4:
+ xor_sum.as_u32x4 ^= data[3 + t->skip_n_vectors] & mask[3];
+ /* FALLTHROUGH */
+ case 3:
+ xor_sum.as_u32x4 ^= data[2 + t->skip_n_vectors] & mask[2];
+ /* FALLTHROUGH */
+ case 2:
+ xor_sum.as_u32x4 ^= data[1 + t->skip_n_vectors] & mask[1];
+ /* FALLTHROUGH */
+ case 1:
+ break;
+ default:
+ abort ();
}
+#else
+ u32 skip_u64 = t->skip_n_vectors * 2;
+ u64 *data64 = (u64 *) h;
+ xor_sum.as_u64[0] = data64[0 + skip_u64] & ((u64 *) mask)[0];
+ xor_sum.as_u64[1] = data64[1 + skip_u64] & ((u64 *) mask)[1];
+ switch (t->match_n_vectors)
+ {
+ case 5:
+ xor_sum.as_u64[0] ^= data64[8 + skip_u64] & ((u64 *) mask)[8];
+ xor_sum.as_u64[1] ^= data64[9 + skip_u64] & ((u64 *) mask)[9];
+ /* FALLTHROUGH */
+ case 4:
+ xor_sum.as_u64[0] ^= data64[6 + skip_u64] & ((u64 *) mask)[6];
+ xor_sum.as_u64[1] ^= data64[7 + skip_u64] & ((u64 *) mask)[7];
+ /* FALLTHROUGH */
+ case 3:
+ xor_sum.as_u64[0] ^= data64[4 + skip_u64] & ((u64 *) mask)[4];
+ xor_sum.as_u64[1] ^= data64[5 + skip_u64] & ((u64 *) mask)[5];
+ /* FALLTHROUGH */
+ case 2:
+ xor_sum.as_u64[0] ^= data64[2 + skip_u64] & ((u64 *) mask)[2];
+ xor_sum.as_u64[1] ^= data64[3 + skip_u64] & ((u64 *) mask)[3];
+ /* FALLTHROUGH */
+ case 1:
+ break;
+
+ default:
+ abort ();
+ }
+#endif /* CLIB_HAVE_VEC128 */
return clib_xxhash (xor_sum.as_u64[0] ^ xor_sum.as_u64[1]);
}
@@ -392,107 +385,98 @@
v = vnet_classify_entry_at_index (t, v, value_index);
#ifdef CLIB_HAVE_VEC128
- if (U32X4_ALIGNED (h))
+ u32x4u *data = (u32x4u *) h;
+ for (i = 0; i < limit; i++)
{
- u32x4 *data = (u32x4 *) h;
- for (i = 0; i < limit; i++)
+ key = v->key;
+ result.as_u32x4 = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ switch (t->match_n_vectors)
{
- key = v->key;
- result.as_u32x4 = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
- switch (t->match_n_vectors)
- {
- case 5:
- result.as_u32x4 |=
- (data[4 + t->skip_n_vectors] & mask[4]) ^ key[4];
- /* FALLTHROUGH */
- case 4:
- result.as_u32x4 |=
- (data[3 + t->skip_n_vectors] & mask[3]) ^ key[3];
- /* FALLTHROUGH */
- case 3:
- result.as_u32x4 |=
- (data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
- /* FALLTHROUGH */
- case 2:
- result.as_u32x4 |=
- (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
- /* FALLTHROUGH */
- case 1:
- break;
- default:
- abort ();
- }
-
- if (u32x4_zero_byte_mask (result.as_u32x4) == 0xffff)
- {
- if (PREDICT_TRUE (now))
- {
- v->hits++;
- v->last_heard = now;
- }
- return (v);
- }
- v = vnet_classify_entry_at_index (t, v, 1);
+ case 5:
+ result.as_u32x4 |= (data[4 + t->skip_n_vectors] & mask[4]) ^ key[4];
+ /* FALLTHROUGH */
+ case 4:
+ result.as_u32x4 |= (data[3 + t->skip_n_vectors] & mask[3]) ^ key[3];
+ /* FALLTHROUGH */
+ case 3:
+ result.as_u32x4 |= (data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
+ /* FALLTHROUGH */
+ case 2:
+ result.as_u32x4 |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+ /* FALLTHROUGH */
+ case 1:
+ break;
+ default:
+ abort ();
}
+
+ if (u32x4_zero_byte_mask (result.as_u32x4) == 0xffff)
+ {
+ if (PREDICT_TRUE (now))
+ {
+ v->hits++;
+ v->last_heard = now;
+ }
+ return (v);
+ }
+ v = vnet_classify_entry_at_index (t, v, 1);
}
- else
+#else
+ u32 skip_u64 = t->skip_n_vectors * 2;
+ u64 *data64 = (u64 *) h;
+ for (i = 0; i < limit; i++)
+ {
+ key = v->key;
+
+ result.as_u64[0] =
+ (data64[0 + skip_u64] & ((u64 *) mask)[0]) ^ ((u64 *) key)[0];
+ result.as_u64[1] =
+ (data64[1 + skip_u64] & ((u64 *) mask)[1]) ^ ((u64 *) key)[1];
+ switch (t->match_n_vectors)
+ {
+ case 5:
+ result.as_u64[0] |=
+ (data64[8 + skip_u64] & ((u64 *) mask)[8]) ^ ((u64 *) key)[8];
+ result.as_u64[1] |=
+ (data64[9 + skip_u64] & ((u64 *) mask)[9]) ^ ((u64 *) key)[9];
+ /* FALLTHROUGH */
+ case 4:
+ result.as_u64[0] |=
+ (data64[6 + skip_u64] & ((u64 *) mask)[6]) ^ ((u64 *) key)[6];
+ result.as_u64[1] |=
+ (data64[7 + skip_u64] & ((u64 *) mask)[7]) ^ ((u64 *) key)[7];
+ /* FALLTHROUGH */
+ case 3:
+ result.as_u64[0] |=
+ (data64[4 + skip_u64] & ((u64 *) mask)[4]) ^ ((u64 *) key)[4];
+ result.as_u64[1] |=
+ (data64[5 + skip_u64] & ((u64 *) mask)[5]) ^ ((u64 *) key)[5];
+ /* FALLTHROUGH */
+ case 2:
+ result.as_u64[0] |=
+ (data64[2 + skip_u64] & ((u64 *) mask)[2]) ^ ((u64 *) key)[2];
+ result.as_u64[1] |=
+ (data64[3 + skip_u64] & ((u64 *) mask)[3]) ^ ((u64 *) key)[3];
+ /* FALLTHROUGH */
+ case 1:
+ break;
+ default:
+ abort ();
+ }
+
+ if (result.as_u64[0] == 0 && result.as_u64[1] == 0)
+ {
+ if (PREDICT_TRUE (now))
+ {
+ v->hits++;
+ v->last_heard = now;
+ }
+ return (v);
+ }
+
+ v = vnet_classify_entry_at_index (t, v, 1);
+ }
#endif /* CLIB_HAVE_VEC128 */
- {
- u32 skip_u64 = t->skip_n_vectors * 2;
- u64 *data64 = (u64 *) h;
- for (i = 0; i < limit; i++)
- {
- key = v->key;
-
- result.as_u64[0] =
- (data64[0 + skip_u64] & ((u64 *) mask)[0]) ^ ((u64 *) key)[0];
- result.as_u64[1] =
- (data64[1 + skip_u64] & ((u64 *) mask)[1]) ^ ((u64 *) key)[1];
- switch (t->match_n_vectors)
- {
- case 5:
- result.as_u64[0] |=
- (data64[8 + skip_u64] & ((u64 *) mask)[8]) ^ ((u64 *) key)[8];
- result.as_u64[1] |=
- (data64[9 + skip_u64] & ((u64 *) mask)[9]) ^ ((u64 *) key)[9];
- /* FALLTHROUGH */
- case 4:
- result.as_u64[0] |=
- (data64[6 + skip_u64] & ((u64 *) mask)[6]) ^ ((u64 *) key)[6];
- result.as_u64[1] |=
- (data64[7 + skip_u64] & ((u64 *) mask)[7]) ^ ((u64 *) key)[7];
- /* FALLTHROUGH */
- case 3:
- result.as_u64[0] |=
- (data64[4 + skip_u64] & ((u64 *) mask)[4]) ^ ((u64 *) key)[4];
- result.as_u64[1] |=
- (data64[5 + skip_u64] & ((u64 *) mask)[5]) ^ ((u64 *) key)[5];
- /* FALLTHROUGH */
- case 2:
- result.as_u64[0] |=
- (data64[2 + skip_u64] & ((u64 *) mask)[2]) ^ ((u64 *) key)[2];
- result.as_u64[1] |=
- (data64[3 + skip_u64] & ((u64 *) mask)[3]) ^ ((u64 *) key)[3];
- /* FALLTHROUGH */
- case 1:
- break;
- default:
- abort ();
- }
-
- if (result.as_u64[0] == 0 && result.as_u64[1] == 0)
- {
- if (PREDICT_TRUE (now))
- {
- v->hits++;
- v->last_heard = now;
- }
- return (v);
- }
-
- v = vnet_classify_entry_at_index (t, v, 1);
- }
- }
return 0;
}
diff --git a/src/vnet/l2/l2_rw.c b/src/vnet/l2/l2_rw.c
index 85a53aa..e585140 100644
--- a/src/vnet/l2/l2_rw.c
+++ b/src/vnet/l2/l2_rw.c
@@ -91,68 +91,26 @@
static_always_inline void
l2_rw_rewrite (l2_rw_entry_t * rwe, u8 * h)
{
- if (U32X4_ALIGNED (h))
+ u32x4u *d = ((u32x4u *) h) + rwe->skip_n_vectors;
+ switch (rwe->rewrite_n_vectors)
{
- u32x4 *d = ((u32x4 *) h) + rwe->skip_n_vectors;
- switch (rwe->rewrite_n_vectors)
- {
- case 5:
- d[4] = (d[4] & ~rwe->mask[4]) | rwe->value[4];
- /* FALLTHROUGH */
- case 4:
- d[3] = (d[3] & ~rwe->mask[3]) | rwe->value[3];
- /* FALLTHROUGH */
- case 3:
- d[2] = (d[2] & ~rwe->mask[2]) | rwe->value[2];
- /* FALLTHROUGH */
- case 2:
- d[1] = (d[1] & ~rwe->mask[1]) | rwe->value[1];
- /* FALLTHROUGH */
- case 1:
- d[0] = (d[0] & ~rwe->mask[0]) | rwe->value[0];
- break;
- default:
- abort ();
- }
- }
- else
- {
- u64 *d = ((u64 *) h) + rwe->skip_n_vectors * 2;
- switch (rwe->rewrite_n_vectors)
- {
- case 5:
- d[8] =
- (d[8] & ~(((u64 *) rwe->mask)[8])) | (((u64 *) rwe->value)[8]);
- d[9] =
- (d[9] & ~(((u64 *) rwe->mask)[9])) | (((u64 *) rwe->value)[9]);
- /* FALLTHROUGH */
- case 4:
- d[6] =
- (d[6] & ~(((u64 *) rwe->mask)[6])) | (((u64 *) rwe->value)[6]);
- d[7] =
- (d[7] & ~(((u64 *) rwe->mask)[7])) | (((u64 *) rwe->value)[7]);
- /* FALLTHROUGH */
- case 3:
- d[4] =
- (d[4] & ~(((u64 *) rwe->mask)[4])) | (((u64 *) rwe->value)[4]);
- d[5] =
- (d[5] & ~(((u64 *) rwe->mask)[5])) | (((u64 *) rwe->value)[5]);
- /* FALLTHROUGH */
- case 2:
- d[2] =
- (d[2] & ~(((u64 *) rwe->mask)[2])) | (((u64 *) rwe->value)[2]);
- d[3] =
- (d[3] & ~(((u64 *) rwe->mask)[3])) | (((u64 *) rwe->value)[3]);
- /* FALLTHROUGH */
- case 1:
- d[0] =
- (d[0] & ~(((u64 *) rwe->mask)[0])) | (((u64 *) rwe->value)[0]);
- d[1] =
- (d[1] & ~(((u64 *) rwe->mask)[1])) | (((u64 *) rwe->value)[1]);
- break;
- default:
- abort ();
- }
+ case 5:
+ d[4] = (d[4] & ~rwe->mask[4]) | rwe->value[4];
+ /* FALLTHROUGH */
+ case 4:
+ d[3] = (d[3] & ~rwe->mask[3]) | rwe->value[3];
+ /* FALLTHROUGH */
+ case 3:
+ d[2] = (d[2] & ~rwe->mask[2]) | rwe->value[2];
+ /* FALLTHROUGH */
+ case 2:
+ d[1] = (d[1] & ~rwe->mask[1]) | rwe->value[1];
+ /* FALLTHROUGH */
+ case 1:
+ d[0] = (d[0] & ~rwe->mask[0]) | rwe->value[0];
+ break;
+ default:
+ abort ();
}
}
diff --git a/src/vppinfra/vector.h b/src/vppinfra/vector.h
index 2b84cc2..906d8d8 100644
--- a/src/vppinfra/vector.h
+++ b/src/vppinfra/vector.h
@@ -76,6 +76,7 @@
#endif
#define _vector_size(n) __attribute__ ((vector_size (n)))
+#define _vector_size_unaligned(n) __attribute__ ((vector_size (n), __aligned__ (1)))
#define foreach_vec64i _(i,8,8) _(i,16,4) _(i,32,2)
#define foreach_vec64u _(u,8,8) _(u,16,4) _(u,32,2)
@@ -111,6 +112,7 @@
/* Type Definitions */
#define _(t,s,c) \
typedef t##s t##s##x##c _vector_size (s/8*c); \
+typedef t##s t##s##x##c##u _vector_size_unaligned (s/8*c); \
typedef union { \
t##s##x##c as_##t##s##x##c; \
t##s as_##t##s[c]; \