vppinfra: new vectorized ip checksum functions incl. csum_and_copy
Type: improvement
Change-Id: Id5810b7f4a6d6e4ce16b73c235b50db5d475ebf7
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/plugins/avf/output.c b/src/plugins/avf/output.c
index 4cc9d5a..8cc76a6 100644
--- a/src/plugins/avf/output.c
+++ b/src/plugins/avf/output.c
@@ -19,6 +19,7 @@
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vppinfra/ring.h>
+#include <vppinfra/vector/ip_csum.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/ip/ip4_packet.h>
@@ -110,7 +111,7 @@
is_tso ? 0 :
clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) -
(l4_hdr_offset - l3_hdr_offset));
- sum = ~ip_csum (&psh, sizeof (psh));
+ sum = ~clib_ip_csum ((u8 *) &psh, sizeof (psh));
}
else
{
@@ -119,7 +120,7 @@
psh.dst = ip6->dst_address;
psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol);
psh.l4len = is_tso ? 0 : ip6->payload_length;
- sum = ~ip_csum (&psh, sizeof (psh));
+ sum = ~clib_ip_csum ((u8 *) &psh, sizeof (psh));
}
/* ip_csum does a byte swap for some reason... */
diff --git a/src/vnet/ip/ip4_input.h b/src/vnet/ip/ip4_input.h
index 383ef31..53948d6 100644
--- a/src/vnet/ip/ip4_input.h
+++ b/src/vnet/ip/ip4_input.h
@@ -42,6 +42,7 @@
#include <vnet/ip/ip.h>
#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/vector/ip_csum.h>
typedef enum
{
@@ -63,15 +64,16 @@
if ((ip->ip_version_and_header_length & 0xf) != 5)
{
*error = IP4_ERROR_OPTIONS;
- if (verify_checksum && ip_csum (ip, ip4_header_bytes (ip)) != 0)
+ if (verify_checksum &&
+ clib_ip_csum ((u8 *) ip, ip4_header_bytes (ip)) != 0)
*error = IP4_ERROR_BAD_CHECKSUM;
}
else
*error = IP4_ERROR_VERSION;
}
- else
- if (PREDICT_FALSE (verify_checksum &&
- ip_csum (ip, sizeof (ip4_header_t)) != 0))
+ else if (PREDICT_FALSE (verify_checksum &&
+ clib_ip_csum ((u8 *) ip, sizeof (ip4_header_t)) !=
+ 0))
*error = IP4_ERROR_BAD_CHECKSUM;
}
diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index 837b3df..04cf9f1 100755
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -149,98 +149,6 @@
extern u8 *format_ip_ecn (u8 * s, va_list * va);
-/* IP checksum support. */
-
-static_always_inline u16
-ip_csum (void *data, u16 n_left)
-{
- u32 sum;
-#ifdef CLIB_HAVE_VEC256
- u16x16 v1, v2;
- u32x8 zero = { 0 };
- u32x8 sum8 = { 0 };
- u32x4 sum4;
-#endif
-
- /* if there is odd number of bytes, pad by zero and store in sum */
- sum = (n_left & 1) ? ((u8 *) data)[n_left - 1] << 8 : 0;
-
- /* we deal with words */
- n_left >>= 1;
-
-#ifdef CLIB_HAVE_VEC256
- while (n_left >= 32)
- {
- v1 = u16x16_load_unaligned (data);
- v2 = u16x16_load_unaligned (data + 32);
-
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
- v1 = u16x16_byte_swap (v1);
- v2 = u16x16_byte_swap (v2);
-#endif
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v2));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v2));
- n_left -= 32;
- data += 64;
- }
-
- if (n_left >= 16)
- {
- v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
- v1 = u16x16_byte_swap (v1);
-#endif
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
- n_left -= 16;
- data += 32;
- }
-
- if (n_left)
- {
- v1 = u16x16_load_unaligned (data);
-#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
- v1 = u16x16_byte_swap (v1);
-#endif
- v1 = u16x16_mask_last (v1, 16 - n_left);
- sum8 += u32x8_from_u16x8 (u16x16_extract_lo (v1));
- sum8 += u32x8_from_u16x8 (u16x16_extract_hi (v1));
- }
-
- sum8 = u32x8_hadd (sum8, zero);
- sum4 = u32x8_extract_lo (sum8) + u32x8_extract_hi (sum8);
- sum += sum4[0] + sum4[1];
-
-#else
- /* scalar version */
- while (n_left >= 8)
- {
- sum += clib_net_to_host_u16 (*((u16 *) data + 0));
- sum += clib_net_to_host_u16 (*((u16 *) data + 1));
- sum += clib_net_to_host_u16 (*((u16 *) data + 2));
- sum += clib_net_to_host_u16 (*((u16 *) data + 3));
- sum += clib_net_to_host_u16 (*((u16 *) data + 4));
- sum += clib_net_to_host_u16 (*((u16 *) data + 5));
- sum += clib_net_to_host_u16 (*((u16 *) data + 6));
- sum += clib_net_to_host_u16 (*((u16 *) data + 7));
- n_left -= 8;
- data += 16;
- }
- while (n_left)
- {
- sum += clib_net_to_host_u16 (*(u16 *) data);
- n_left -= 1;
- data += 2;
- }
-#endif
-
- sum = (sum & 0xffff) + (sum >> 16);
- sum = (sum & 0xffff) + (sum >> 16);
- return ~((u16) sum);
-}
-
/* Incremental checksum update. */
typedef uword ip_csum_t;
diff --git a/src/vnet/ip/ip_psh_cksum.h b/src/vnet/ip/ip_psh_cksum.h
index eaac401..8723749 100644
--- a/src/vnet/ip/ip_psh_cksum.h
+++ b/src/vnet/ip/ip_psh_cksum.h
@@ -7,6 +7,7 @@
#define included_ip_psh_cksum_h
#include <vnet/ip/ip.h>
+#include <vppinfra/vector/ip_csum.h>
typedef struct _ip4_psh
{
@@ -37,7 +38,8 @@
psh.proto = ip4->protocol;
psh.l4len = clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) -
sizeof (ip4_header_t));
- return ~clib_net_to_host_u16 (ip_csum (&psh, sizeof (ip4_psh_t)));
+ return ~clib_net_to_host_u16 (
+ clib_ip_csum ((u8 *) &psh, sizeof (ip4_psh_t)));
}
static_always_inline u16
@@ -48,7 +50,8 @@
psh.dst = ip6->dst_address;
psh.l4len = ip6->payload_length;
psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol);
- return ~clib_net_to_host_u16 (ip_csum (&psh, sizeof (ip6_psh_t)));
+ return ~clib_net_to_host_u16 (
+ clib_ip_csum ((u8 *) &psh, sizeof (ip6_psh_t)));
}
#endif /* included_ip_psh_cksum_h */
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 9f407a1..7a73fe5 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -194,6 +194,7 @@
vector/compress.h
vector/count_equal.h
vector/index_to_ptr.h
+ vector/ip_csum.h
vector/mask_compare.h
vector.h
vector_neon.h
@@ -275,6 +276,7 @@
vector/test/compress.c
vector/test/count_equal.c
vector/test/index_to_ptr.c
+ vector/test/ip_csum.c
vector/test/mask_compare.c
vector/test/memcpy_x86_64.c
)
diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h
new file mode 100644
index 0000000..2cea9b4
--- /dev/null
+++ b/src/vppinfra/vector/ip_csum.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_ip_csum_h
+#define included_vector_ip_csum_h
+#include <vppinfra/clib.h>
+typedef struct
+{
+ u64 sum;
+ u8 odd;
+} clib_ip_csum_t;
+
+#if defined(CLIB_HAVE_VEC128)
+static_always_inline u64x2
+clib_ip_csum_cvt_and_add_4 (u32x4 v)
+{
+ return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
+ (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_2 (u64x2 v)
+{
+ return v[0] + v[1];
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+static_always_inline u64x4
+clib_ip_csum_cvt_and_add_8 (u32x8 v)
+{
+ return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
+ (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_4 (u64x4 v)
+{
+ return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC512)
+static_always_inline u64x8
+clib_ip_csum_cvt_and_add_16 (u32x16 v)
+{
+ return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
+ (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_8 (u64x8 v)
+{
+ return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
+}
+#endif
+
+static_always_inline void
+clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
+ int is_copy)
+{
+ if (c->odd)
+ {
+ c->odd = 0;
+ c->sum += (u16) src[0] << 8;
+ count--;
+ src++;
+ if (is_copy)
+ dst++[0] = src[0];
+ }
+
+#if defined(CLIB_HAVE_VEC512)
+ u64x8 sum8 = {};
+
+ while (count >= 512)
+ {
+ u32x16u *s = (u32x16u *) src;
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[8]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
+ count -= 512;
+ src += 512;
+ if (is_copy)
+ {
+ u32x16u *d = (u32x16u *) dst;
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ d[4] = s[4];
+ d[5] = s[5];
+ d[6] = s[6];
+ d[7] = s[7];
+ dst += 512;
+ }
+ }
+
+ while (count >= 64)
+ {
+ u32x16u *s = (u32x16u *) src;
+ sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+ count -= 64;
+ src += 64;
+ if (is_copy)
+ {
+ u32x16u *d = (u32x16u *) dst;
+ d[0] = s[0];
+ dst += 512;
+ }
+ }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+ if (count)
+ {
+ u64 mask = pow2_mask (count);
+ u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
+ sum8 += clib_ip_csum_cvt_and_add_16 (v);
+ c->odd = count & 1;
+ if (is_copy)
+ u32x16_mask_store (v, dst, mask);
+ }
+ c->sum += clib_ip_csum_hadd_8 (sum8);
+ return;
+#endif
+
+ c->sum += clib_ip_csum_hadd_8 (sum8);
+#elif defined(CLIB_HAVE_VEC256)
+ u64x4 sum4 = {};
+
+ while (count >= 256)
+ {
+ u32x8u *s = (u32x8u *) src;
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
+ count -= 256;
+ src += 256;
+ if (is_copy)
+ {
+ u32x8u *d = (u32x8u *) dst;
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ d[4] = s[4];
+ d[5] = s[5];
+ d[6] = s[6];
+ d[7] = s[7];
+ dst += 256;
+ }
+ }
+
+ while (count >= 32)
+ {
+ u32x8u *s = (u32x8u *) src;
+ sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+ count -= 32;
+ src += 32;
+ if (is_copy)
+ {
+ u32x8u *d = (u32x8u *) dst;
+ d[0] = s[0];
+ dst += 32;
+ }
+ }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+ if (count)
+ {
+ u32 mask = pow2_mask (count);
+ u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
+ sum4 += clib_ip_csum_cvt_and_add_8 (v);
+ c->odd = count & 1;
+ if (is_copy)
+ u32x8_mask_store (v, dst, mask);
+ }
+ c->sum += clib_ip_csum_hadd_4 (sum4);
+ return;
+#endif
+
+ c->sum += clib_ip_csum_hadd_4 (sum4);
+#elif defined(CLIB_HAVE_VEC128)
+ u64x2 sum2 = {};
+
+ while (count >= 128)
+ {
+ u32x4u *s = (u32x4u *) src;
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
+ count -= 128;
+ src += 128;
+ if (is_copy)
+ {
+ u32x4u *d = (u32x4u *) dst;
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ d[4] = s[4];
+ d[5] = s[5];
+ d[6] = s[6];
+ d[7] = s[7];
+ dst += 128;
+ }
+ }
+
+ while (count >= 16)
+ {
+ u32x4u *s = (u32x4u *) src;
+ sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+ count -= 16;
+ src += 16;
+ if (is_copy)
+ {
+ u32x4u *d = (u32x4u *) dst;
+ d[0] = s[0];
+ dst += 16;
+ }
+ }
+ c->sum += clib_ip_csum_hadd_2 (sum2);
+#else
+ while (count >= 4)
+ {
+ u32 v = *((u32 *) src);
+ c->sum += v;
+ count -= 4;
+ src += 4;
+ if (is_copy)
+ {
+ *(u32 *) dst = v;
+ dst += 4;
+ }
+ }
+#endif
+ while (count >= 2)
+ {
+ u16 v = *((u16 *) src);
+ c->sum += v;
+ count -= 2;
+ src += 2;
+ if (is_copy)
+ {
+ *(u16 *) dst = v;
+ dst += 2;
+ }
+ }
+
+ if (count)
+ {
+ c->odd = 1;
+ c->sum += (u16) src[0];
+ if (is_copy)
+ dst[0] = src[0];
+ }
+}
+
+static_always_inline u16
+clib_ip_csum_fold (clib_ip_csum_t *c)
+{
+ u64 sum = c->sum;
+#if defined(__x86_64__) && defined(__BMI2__)
+ u64 tmp = sum;
+ asm volatile(
+ /* using ADC is much faster than mov, shift, add sequence
+ * compiler produces */
+ "shr $32, %[sum] \n\t"
+ "add %k[tmp], %k[sum] \n\t"
+ "mov $16, %k[tmp] \n\t"
+ "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
+ "adc %w[tmp], %w[sum] \n\t"
+ "adc $0, %w[sum] \n\t"
+ : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp));
+#else
+ sum = ((u32) sum) + (sum >> 32);
+ sum = ((u16) sum) + (sum >> 16);
+ sum = ((u16) sum) + (sum >> 16);
+#endif
+ return (~((u16) sum));
+}
+
+static_always_inline void
+clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
+{
+ return clib_ip_csum_inline (c, 0, src, count, 0);
+}
+
+static_always_inline void
+clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
+{
+ return clib_ip_csum_inline (c, dst, src, count, 1);
+}
+
+static_always_inline u16
+clib_ip_csum (u8 *src, u16 count)
+{
+ clib_ip_csum_t c = {};
+ if (COMPILE_TIME_CONST (count) && count == 12)
+ {
+ for (int i = 0; i < 3; i++)
+ c.sum += ((u32 *) src)[i];
+ }
+ else if (COMPILE_TIME_CONST (count) && count == 20)
+ {
+ for (int i = 0; i < 5; i++)
+ c.sum += ((u32 *) src)[i];
+ }
+ else if (COMPILE_TIME_CONST (count) && count == 40)
+ {
+ for (int i = 0; i < 10; i++)
+ c.sum += ((u32 *) src)[i];
+ }
+ else
+ clib_ip_csum_inline (&c, 0, src, count, 0);
+ return clib_ip_csum_fold (&c);
+}
+
+static_always_inline u16
+clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
+{
+ clib_ip_csum_t c = {};
+ clib_ip_csum_inline (&c, dst, src, count, 1);
+ return clib_ip_csum_fold (&c);
+}
+
+#endif
diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c
new file mode 100644
index 0000000..135d5ae
--- /dev/null
+++ b/src/vppinfra/vector/test/ip_csum.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/ip_csum.h>
+
+typedef struct
+{
+ struct
+ {
+ u8 *src;
+ u32 count;
+ } chunk[5];
+ u16 result;
+} ip_csum_test_t;
+
+static u8 test1[] = { 0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40,
+ 0x00, 0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8,
+ 0x00, 0x01, 0xc0, 0xa8, 0x00, 0xc7, 0x00 };
+#define TEST_LEN(x) (ARRAY_LEN (x) - 1)
+
+static ip_csum_test_t tests[] = { {
+ .chunk[0].src = test1,
+ .chunk[0].count = TEST_LEN (test1),
+ .result = 0x61b8,
+ },
+ {
+ .chunk[0].src = test1,
+ .chunk[0].count = 1,
+ .chunk[1].src = test1 + 1,
+ .chunk[1].count = 2,
+ .chunk[2].src = test1 + 3,
+ .chunk[2].count = 3,
+ .chunk[3].src = test1 + 6,
+ .chunk[3].count = 4,
+ .chunk[4].src = test1 + 10,
+ .chunk[4].count = TEST_LEN (test1) - 10,
+ .result = 0x61b8,
+ },
+ {
+ .chunk[0].count = 1,
+ .result = 0xff0f,
+ },
+ {
+ .chunk[0].count = 2,
+ .result = 0x080f,
+ },
+ {
+ .chunk[0].count = 3,
+ .result = 0x0711,
+ },
+ {
+ .chunk[0].count = 4,
+ .result = 0x1210,
+ },
+ {
+ .chunk[0].count = 63,
+ .result = 0xda01,
+ },
+ {
+ .chunk[0].count = 64,
+ .result = 0xe100,
+ },
+ {
+ .chunk[0].count = 65,
+ .result = 0xe010,
+ },
+ {
+ .chunk[0].count = 65535,
+ .result = 0xfc84,
+ },
+ {
+ .chunk[0].count = 65536,
+ .result = 0xffff,
+ } };
+
+static clib_error_t *
+test_clib_ip_csum (clib_error_t *err)
+{
+ u8 *buf;
+ buf = clib_mem_alloc_aligned (65536, CLIB_CACHE_LINE_BYTES);
+ for (int i = 0; i < 65536; i++)
+ buf[i] = 0xf0 + ((i * 7) & 0xf);
+
+ for (int i = 0; i < ARRAY_LEN (tests); i++)
+ {
+ clib_ip_csum_t c = {};
+ ip_csum_test_t *t = tests + i;
+ u16 rv;
+
+ for (int j = 0; j < ARRAY_LEN (((ip_csum_test_t *) 0)->chunk); j++)
+ if (t->chunk[j].count > 0)
+ {
+ if (t->chunk[j].src == 0)
+ clib_ip_csum_chunk (&c, buf, t->chunk[j].count);
+ else
+ clib_ip_csum_chunk (&c, t->chunk[j].src, t->chunk[j].count);
+ }
+ rv = clib_ip_csum_fold (&c);
+
+ if (rv != tests[i].result)
+ {
+ err = clib_error_return (err,
+ "bad checksum in test case %u (expected "
+ "0x%04x, calculated 0x%04x)",
+ i, tests[i].result, rv);
+ goto done;
+ }
+ }
+done:
+ clib_mem_free (buf);
+ return err;
+}
+
+REGISTER_TEST (clib_ip_csum) = {
+ .name = "clib_ip_csum",
+ .fn = test_clib_ip_csum,
+};
diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h
index 70b05c6..80d7bda 100644
--- a/src/vppinfra/vector_neon.h
+++ b/src/vppinfra/vector_neon.h
@@ -211,6 +211,18 @@
#define u8x16_word_shift_left(x,n) vextq_u8(u8x16_splat (0), x, 16 - n)
#define u8x16_word_shift_right(x,n) vextq_u8(x, u8x16_splat (0), n)
+always_inline u32x4
+u32x4_interleave_hi (u32x4 a, u32x4 b)
+{
+ return (u32x4) vzip2q_u32 (a, b);
+}
+
+always_inline u32x4
+u32x4_interleave_lo (u32x4 a, u32x4 b)
+{
+ return (u32x4) vzip1q_u32 (a, b);
+}
+
static_always_inline u8x16
u8x16_reflect (u8x16 v)
{