vppinfra: new vectorized ip checksum functions incl. csum_and_copy

Type: improvement
Change-Id: Id5810b7f4a6d6e4ce16b73c235b50db5d475ebf7
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vppinfra/vector/ip_csum.h b/src/vppinfra/vector/ip_csum.h
new file mode 100644
index 0000000..2cea9b4
--- /dev/null
+++ b/src/vppinfra/vector/ip_csum.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_ip_csum_h
+#define included_vector_ip_csum_h
+#include <vppinfra/clib.h>
+typedef struct
+{
+  u64 sum; /* running checksum accumulator, not yet folded to 16 bits */
+  u8 odd;  /* non-zero when an odd number of bytes has been consumed so far */
+} clib_ip_csum_t;
+
+#if defined(CLIB_HAVE_VEC128)
+static_always_inline u64x2
+clib_ip_csum_cvt_and_add_4 (u32x4 v)
+{ /* zero-extend four u32 lanes to u64 and add pairwise; u64 lanes cannot overflow here */
+  return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
+	  (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_2 (u64x2 v)
+{ /* horizontal add of both 64-bit lanes */
+  return v[0] + v[1];
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC256)
+static_always_inline u64x4
+clib_ip_csum_cvt_and_add_8 (u32x8 v)
+{ /* zero-extend eight u32 lanes to u64 and add pairwise (256-bit variant) */
+  return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
+	  (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_4 (u64x4 v)
+{ /* reduce four u64 lanes to a scalar via the 128-bit reducer */
+  return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
+}
+#endif
+
+#if defined(CLIB_HAVE_VEC512)
+static_always_inline u64x8
+clib_ip_csum_cvt_and_add_16 (u32x16 v)
+{ /* zero-extend sixteen u32 lanes to u64 and add pairwise (512-bit variant) */
+  return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
+	  (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
+}
+static_always_inline u64
+clib_ip_csum_hadd_8 (u64x8 v)
+{ /* reduce eight u64 lanes to a scalar via the 256-bit reducer */
+  return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
+}
+#endif
+
+static_always_inline void
+clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
+		     int is_copy)
+{
+  if (c->odd) /* previous chunk ended on an odd byte; pair it with src[0] */
+    {
+      c->odd = 0;
+      c->sum += (u16) src[0] << 8;
+      if (is_copy) /* copy the byte being consumed, before src advances */
+	dst++[0] = src[0];
+      count--;
+      src++;
+    }
+
+#if defined(CLIB_HAVE_VEC512)
+  u64x8 sum8 = {};
+
+  while (count >= 512)
+    {
+      u32x16u *s = (u32x16u *) src;
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[4]); /* fixed: was s[8] (OOB, skipped s[4]) */
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
+      count -= 512;
+      src += 512;
+      if (is_copy)
+	{
+	  u32x16u *d = (u32x16u *) dst;
+	  d[0] = s[0];
+	  d[1] = s[1];
+	  d[2] = s[2];
+	  d[3] = s[3];
+	  d[4] = s[4];
+	  d[5] = s[5];
+	  d[6] = s[6];
+	  d[7] = s[7];
+	  dst += 512;
+	}
+    }
+
+  while (count >= 64)
+    {
+      u32x16u *s = (u32x16u *) src;
+      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
+      count -= 64;
+      src += 64;
+      if (is_copy)
+	{
+	  u32x16u *d = (u32x16u *) dst;
+	  d[0] = s[0];
+	  dst += 64; /* fixed: was 512; must match the 64 bytes copied */
+	}
+    }
+
+#ifdef CLIB_HAVE_VEC512_MASK_LOAD_STORE /* fixed: was the VEC256 macro */
+  if (count)
+    {
+      u64 mask = pow2_mask (count);
+      u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
+      sum8 += clib_ip_csum_cvt_and_add_16 (v);
+      c->odd = count & 1;
+      if (is_copy)
+	u8x64_mask_store ((u8x64) v, dst, mask); /* byte mask needs byte-granular store */
+    }
+  c->sum += clib_ip_csum_hadd_8 (sum8);
+  return;
+#endif
+
+  c->sum += clib_ip_csum_hadd_8 (sum8);
+#elif defined(CLIB_HAVE_VEC256)
+  u64x4 sum4 = {};
+
+  while (count >= 256)
+    {
+      u32x8u *s = (u32x8u *) src;
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
+      count -= 256;
+      src += 256;
+      if (is_copy)
+	{
+	  u32x8u *d = (u32x8u *) dst;
+	  d[0] = s[0];
+	  d[1] = s[1];
+	  d[2] = s[2];
+	  d[3] = s[3];
+	  d[4] = s[4];
+	  d[5] = s[5];
+	  d[6] = s[6];
+	  d[7] = s[7];
+	  dst += 256;
+	}
+    }
+
+  while (count >= 32)
+    {
+      u32x8u *s = (u32x8u *) src;
+      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
+      count -= 32;
+      src += 32;
+      if (is_copy)
+	{
+	  u32x8u *d = (u32x8u *) dst;
+	  d[0] = s[0];
+	  dst += 32;
+	}
+    }
+
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+  if (count)
+    {
+      u32 mask = pow2_mask (count);
+      u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
+      sum4 += clib_ip_csum_cvt_and_add_8 (v);
+      c->odd = count & 1;
+      if (is_copy)
+	u8x32_mask_store ((u8x32) v, dst, mask); /* byte mask needs byte-granular store */
+    }
+  c->sum += clib_ip_csum_hadd_4 (sum4);
+  return;
+#endif
+
+  c->sum += clib_ip_csum_hadd_4 (sum4);
+#elif defined(CLIB_HAVE_VEC128)
+  u64x2 sum2 = {};
+
+  while (count >= 128)
+    {
+      u32x4u *s = (u32x4u *) src;
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
+      count -= 128;
+      src += 128;
+      if (is_copy)
+	{
+	  u32x4u *d = (u32x4u *) dst;
+	  d[0] = s[0];
+	  d[1] = s[1];
+	  d[2] = s[2];
+	  d[3] = s[3];
+	  d[4] = s[4];
+	  d[5] = s[5];
+	  d[6] = s[6];
+	  d[7] = s[7];
+	  dst += 128;
+	}
+    }
+
+  while (count >= 16)
+    {
+      u32x4u *s = (u32x4u *) src;
+      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
+      count -= 16;
+      src += 16;
+      if (is_copy)
+	{
+	  u32x4u *d = (u32x4u *) dst;
+	  d[0] = s[0];
+	  dst += 16;
+	}
+    }
+  c->sum += clib_ip_csum_hadd_2 (sum2);
+#else
+  while (count >= 4)
+    {
+      u32 v = *((u32 *) src);
+      c->sum += v;
+      count -= 4;
+      src += 4;
+      if (is_copy)
+	{
+	  *(u32 *) dst = v;
+	  dst += 4;
+	}
+    }
+#endif
+  while (count >= 2)
+    {
+      u16 v = *((u16 *) src);
+      c->sum += v;
+      count -= 2;
+      src += 2;
+      if (is_copy)
+	{
+	  *(u16 *) dst = v;
+	  dst += 2;
+	}
+    }
+
+  if (count)
+    {
+      c->odd = 1; /* remember the dangling byte for the next chunk */
+      c->sum += (u16) src[0];
+      if (is_copy)
+	dst[0] = src[0];
+    }
+}
+
+static_always_inline u16
+clib_ip_csum_fold (clib_ip_csum_t *c)
+{ /* fold the 64-bit accumulator down to 16 bits and return its complement */
+  u64 sum = c->sum;
+#if defined(__x86_64__) && defined(__BMI2__)
+  u64 tmp = sum;
+  asm volatile(
+    /* using ADC is much faster than mov, shift, add sequence
+     * compiler produces */
+    "shr	$32, %[sum]			\n\t"
+    "add	%k[tmp], %k[sum]		\n\t"
+    "mov	$16, %k[tmp]			\n\t"
+    "shrx	%k[tmp], %k[sum], %k[tmp]	\n\t"
+    "adc	%w[tmp], %w[sum]		\n\t"
+    "adc	$0, %w[sum]			\n\t"
+    : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp));
+#else
+  sum = ((u32) sum) + (sum >> 32); /* 64 -> 33 bits */
+  sum = ((u16) sum) + (sum >> 16); /* 33 -> 17 bits */
+  sum = ((u16) sum) + (sum >> 16); /* 17 -> 16 bits, carry absorbed */
+#endif
+  return (~((u16) sum));
+}
+
+static_always_inline void
+clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
+{ /* accumulate one chunk into c; checksum only, no copy */
+  clib_ip_csum_inline (c, 0, src, count, 0);
+}
+
+static_always_inline void
+clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
+{ /* accumulate one chunk into c while copying src to dst */
+  clib_ip_csum_inline (c, dst, src, count, 1);
+}
+
+static_always_inline u16
+clib_ip_csum (u8 *src, u16 count)
+{ /* one-shot checksum; unrolled fast paths for compile-time-known sizes */
+  clib_ip_csum_t c = {};
+  if (COMPILE_TIME_CONST (count) && count == 12)
+    { /* 12 bytes — e.g. a TCP/UDP pseudo-header */
+      for (int i = 0; i < 3; i++)
+	c.sum += ((u32 *) src)[i];
+    }
+  else if (COMPILE_TIME_CONST (count) && count == 20)
+    { /* 20 bytes — e.g. an IPv4 header without options */
+      for (int i = 0; i < 5; i++)
+	c.sum += ((u32 *) src)[i];
+    }
+  else if (COMPILE_TIME_CONST (count) && count == 40)
+    { /* 40 bytes — e.g. an IPv6 header */
+      for (int i = 0; i < 10; i++)
+	c.sum += ((u32 *) src)[i];
+    }
+  else
+    clib_ip_csum_inline (&c, 0, src, count, 0);
+  return clib_ip_csum_fold (&c);
+}
+
+static_always_inline u16
+clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
+{ /* one-shot: copy count bytes from src to dst and return their folded checksum */
+  clib_ip_csum_t c = {};
+  clib_ip_csum_inline (&c, dst, src, count, 1);
+  return clib_ip_csum_fold (&c);
+}
+
+#endif
diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c
new file mode 100644
index 0000000..135d5ae
--- /dev/null
+++ b/src/vppinfra/vector/test/ip_csum.c
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/ip_csum.h>
+
+typedef struct
+{
+  struct
+  {
+    u8 *src;   /* chunk data; 0 means "use the shared test buffer" */
+    u32 count; /* chunk length in bytes; 0 marks an unused slot */
+  } chunk[5];
+  u16 result; /* expected folded checksum */
+} ip_csum_test_t;
+
+static u8 test1[] = { 0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40,
+		      0x00, 0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8,
+		      0x00, 0x01, 0xc0, 0xa8, 0x00, 0xc7, 0x00 }; /* looks like a 20-byte IPv4 header + 1 pad byte */
+#define TEST_LEN(x) (ARRAY_LEN (x) - 1) /* exclude the trailing pad byte */
+
+static ip_csum_test_t tests[] = { {
+				    .chunk[0].src = test1,
+				    .chunk[0].count = TEST_LEN (test1),
+				    .result = 0x61b8, /* whole header in one chunk */
+				  },
+				  {
+				    .chunk[0].src = test1,
+				    .chunk[0].count = 1,
+				    .chunk[1].src = test1 + 1,
+				    .chunk[1].count = 2,
+				    .chunk[2].src = test1 + 3,
+				    .chunk[2].count = 3,
+				    .chunk[3].src = test1 + 6,
+				    .chunk[3].count = 4,
+				    .chunk[4].src = test1 + 10,
+				    .chunk[4].count = TEST_LEN (test1) - 10,
+				    .result = 0x61b8, /* same data split into odd/even chunks */
+				  },
+				  {
+				    .chunk[0].count = 1, /* src 0: shared buffer */
+				    .result = 0xff0f,
+				  },
+				  {
+				    .chunk[0].count = 2,
+				    .result = 0x080f,
+				  },
+				  {
+				    .chunk[0].count = 3,
+				    .result = 0x0711,
+				  },
+				  {
+				    .chunk[0].count = 4,
+				    .result = 0x1210,
+				  },
+				  {
+				    .chunk[0].count = 63, /* around the 64-byte vector boundary */
+				    .result = 0xda01,
+				  },
+				  {
+				    .chunk[0].count = 64,
+				    .result = 0xe100,
+				  },
+				  {
+				    .chunk[0].count = 65,
+				    .result = 0xe010,
+				  },
+				  {
+				    .chunk[0].count = 65535, /* max u16 count */
+				    .result = 0xfc84,
+				  },
+				  {
+				    .chunk[0].count = 65536, /* narrows to u16 0 at the call site */
+				    .result = 0xffff,
+				  } };
+
+static clib_error_t *
+test_clib_ip_csum (clib_error_t *err)
+{ /* run every test case; each chunk list is fed through clib_ip_csum_chunk */
+  u8 *buf;
+  buf = clib_mem_alloc_aligned (65536, CLIB_CACHE_LINE_BYTES);
+  for (int i = 0; i < 65536; i++)
+    buf[i] = 0xf0 + ((i * 7) & 0xf); /* deterministic fill; expected results depend on it */
+
+  for (int i = 0; i < ARRAY_LEN (tests); i++)
+    {
+      clib_ip_csum_t c = {};
+      ip_csum_test_t *t = tests + i;
+      u16 rv;
+
+      for (int j = 0; j < ARRAY_LEN (((ip_csum_test_t *) 0)->chunk); j++)
+	if (t->chunk[j].count > 0) /* count 0 marks an unused chunk slot */
+	  {
+	    if (t->chunk[j].src == 0) /* NOTE(review): count narrows u32->u16 here; 65536 becomes 0 — presumably intentional for the last case */
+	      clib_ip_csum_chunk (&c, buf, t->chunk[j].count);
+	    else
+	      clib_ip_csum_chunk (&c, t->chunk[j].src, t->chunk[j].count);
+	  }
+      rv = clib_ip_csum_fold (&c);
+
+      if (rv != tests[i].result)
+	{
+	  err = clib_error_return (err,
+				   "bad checksum in test case %u (expected "
+				   "0x%04x, calculated 0x%04x)",
+				   i, tests[i].result, rv);
+	  goto done;
+	}
+    }
+done:
+  clib_mem_free (buf);
+  return err;
+}
+
+REGISTER_TEST (clib_ip_csum) = { /* hook the test into the vppinfra test runner */
+  .name = "clib_ip_csum",
+  .fn = test_clib_ip_csum,
+};