vppinfra: fix x86 packs / packus wrappers
They both take signed value as input.
Type: fix
Change-Id: If3d8ec4e0b1c02d7d65262bdd9db49ff7fbfef39
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h
index 584bd20..f38a3bd 100644
--- a/src/vppinfra/vector_avx2.h
+++ b/src/vppinfra/vector_avx2.h
@@ -105,17 +105,19 @@
#undef _
/* *INDENT-ON* */
-always_inline u8x32
-u16x16_pack (u16x16 lo, u16x16 hi)
-{
- return (u8x32) _mm256_packus_epi16 ((__m256i) lo, (__m256i) hi);
-}
+/* 256 bit packs. */
+#define _(f, t, fn) \
+ always_inline t t##_pack (f lo, f hi) \
+ { \
+ return (t) fn ((__m256i) lo, (__m256i) hi); \
+ }
-always_inline i8x32
-i16x16_pack (i16x16 lo, i16x16 hi)
-{
- return (i8x32) _mm256_packs_epi16 ((__m256i) lo, (__m256i) hi);
-}
+_ (i16x16, i8x32, _mm256_packs_epi16)
+_ (i16x16, u8x32, _mm256_packus_epi16)
+_ (i32x8, i16x16, _mm256_packs_epi32)
+_ (i32x8, u16x16, _mm256_packus_epi32)
+
+#undef _
static_always_inline u32
u8x32_msb_mask (u8x32 v)
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index 2f5763e..3a01c1e 100644
--- a/src/vppinfra/vector_avx512.h
+++ b/src/vppinfra/vector_avx512.h
@@ -85,6 +85,19 @@
return (u32) _mm512_movepi16_mask ((__m512i) v);
}
+/* 512-bit packs */
+#define _(f, t, fn) \
+ always_inline t t##_pack (f lo, f hi) \
+ { \
+ return (t) fn ((__m512i) lo, (__m512i) hi); \
+ }
+
+_ (i16x32, i8x64, _mm512_packs_epi16)
+_ (i16x32, u8x64, _mm512_packus_epi16)
+_ (i32x16, i16x32, _mm512_packs_epi32)
+_ (i32x16, u16x32, _mm512_packus_epi32)
+#undef _
+
static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
diff --git a/src/vppinfra/vector_funcs.h b/src/vppinfra/vector_funcs.h
index 2b02d9e..5c446a5 100644
--- a/src/vppinfra/vector_funcs.h
+++ b/src/vppinfra/vector_funcs.h
@@ -27,37 +27,36 @@
u16x16u *av = (u16x16u *) a;
i8x32 x;
- x = i16x16_pack (v16 == av[0], v16 == av[1]);
+ x = i8x32_pack (v16 == av[0], v16 == av[1]);
mask = i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3));
- x = i16x16_pack (v16 == av[2], v16 == av[3]);
+ x = i8x32_pack (v16 == av[2], v16 == av[3]);
mask |= (u64) i8x32_msb_mask ((i8x32) u64x4_permute (x, 0, 2, 1, 3)) << 32;
#elif defined(CLIB_HAVE_VEC128) && defined(__ARM_NEON)
- u16x8 idx8 = u16x8_splat (v);
+ u16x8 v8 = u16x8_splat (v);
u16x8 m = { 1, 2, 4, 8, 16, 32, 64, 128 };
u16x8u *av = (u16x8u *) a;
- /* compare each u16 elemment with idx8, result gives 0xffff in each element
+ /* compare each u16 elemment with v8, result gives 0xffff in each element
of the resulting vector if comparison result is true.
Bitwise AND with m will give us one bit set for true result and offset
of that bit represend element index. Finally vaddvq_u16() gives us sum
of all elements of the vector which will give us u8 bitmap. */
- mask = ((u64) vaddvq_u16 ((av[0] == idx8) & m) |
- (u64) vaddvq_u16 ((av[1] == idx8) & m) << 8 |
- (u64) vaddvq_u16 ((av[2] == idx8) & m) << 16 |
- (u64) vaddvq_u16 ((av[3] == idx8) & m) << 24 |
- (u64) vaddvq_u16 ((av[4] == idx8) & m) << 32 |
- (u64) vaddvq_u16 ((av[5] == idx8) & m) << 40 |
- (u64) vaddvq_u16 ((av[6] == idx8) & m) << 48 |
- (u64) vaddvq_u16 ((av[7] == idx8) & m) << 56);
+ mask = ((u64) vaddvq_u16 ((av[0] == v8) & m) |
+ (u64) vaddvq_u16 ((av[1] == v8) & m) << 8 |
+ (u64) vaddvq_u16 ((av[2] == v8) & m) << 16 |
+ (u64) vaddvq_u16 ((av[3] == v8) & m) << 24 |
+ (u64) vaddvq_u16 ((av[4] == v8) & m) << 32 |
+ (u64) vaddvq_u16 ((av[5] == v8) & m) << 40 |
+ (u64) vaddvq_u16 ((av[6] == v8) & m) << 48 |
+ (u64) vaddvq_u16 ((av[7] == v8) & m) << 56);
#elif defined(CLIB_HAVE_VEC128) && defined(CLIB_HAVE_VEC128_MSB_MASK)
- u16x8 idx8 = u16x8_splat (v);
+ u16x8 v8 = u16x8_splat (v);
u16x8u *av = (u16x8u *) a;
- mask =
- ((u64) i8x16_msb_mask (i16x8_pack (idx8 == av[0], idx8 == av[1])) |
- (u64) i8x16_msb_mask (i16x8_pack (idx8 == av[2], idx8 == av[3])) << 16 |
- (u64) i8x16_msb_mask (i16x8_pack (idx8 == av[4], idx8 == av[5])) << 32 |
- (u64) i8x16_msb_mask (i16x8_pack (idx8 == av[6], idx8 == av[7])) << 48);
+ mask = ((u64) i8x16_msb_mask (i8x16_pack (v8 == av[0], v8 == av[1])) |
+ (u64) i8x16_msb_mask (i8x16_pack (v8 == av[2], v8 == av[3])) << 16 |
+ (u64) i8x16_msb_mask (i8x16_pack (v8 == av[4], v8 == av[5])) << 32 |
+ (u64) i8x16_msb_mask (i8x16_pack (v8 == av[6], v8 == av[7])) << 48);
#else
for (int i = 0; i < 64; i++)
if (a[i] == v)
diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h
index f86fad3..1bdb34b 100644
--- a/src/vppinfra/vector_sse42.h
+++ b/src/vppinfra/vector_sse42.h
@@ -184,74 +184,18 @@
}
/* 128 bit packs. */
-always_inline u8x16
-u16x8_pack (u16x8 lo, u16x8 hi)
-{
- return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
-}
+#define _(f, t, fn) \
+ always_inline t t##_pack (f lo, f hi) \
+ { \
+ return (t) fn ((__m128i) lo, (__m128i) hi); \
+ }
-always_inline i8x16
-i16x8_pack (i16x8 lo, i16x8 hi)
-{
- return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
-}
+_ (i16x8, i8x16, _mm_packs_epi16)
+_ (i16x8, u8x16, _mm_packus_epi16)
+_ (i32x4, i16x8, _mm_packs_epi32)
+_ (i32x4, u16x8, _mm_packus_epi32)
-always_inline u16x8
-u32x4_pack (u32x4 lo, u32x4 hi)
-{
- return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
-}
-
-/* 64 bit packs. */
-always_inline u8x8
-u16x4_pack (u16x4 lo, u16x4 hi)
-{
- return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
-}
-
-always_inline i8x8
-i16x4_pack (i16x4 lo, i16x4 hi)
-{
- return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
-}
-
-always_inline u16x4
-u32x2_pack (u32x2 lo, u32x2 hi)
-{
- return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
-}
-
-always_inline i16x4
-i32x2_pack (i32x2 lo, i32x2 hi)
-{
- return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
-}
-
-#ifndef __ICC
-always_inline u64x2
-u64x2_read_lo (u64x2 x, u64 * a)
-{
- return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
-}
-
-always_inline u64x2
-u64x2_read_hi (u64x2 x, u64 * a)
-{
- return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
-}
-
-always_inline void
-u64x2_write_lo (u64x2 x, u64 * a)
-{
- _mm_storel_pi ((__m64 *) a, (__m128) x);
-}
-
-always_inline void
-u64x2_write_hi (u64x2 x, u64 * a)
-{
- _mm_storeh_pi ((__m64 *) a, (__m128) x);
-}
-#endif
+#undef _
#define _signed_binop(n,m,f,g) \
/* Unsigned */ \