vppinfra: new memcpy for x86_64

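Add a dedicated inline memcpy for x86_64. Compile-time-constant sizes
are fully unrolled into fixed-size vector moves; runtime sizes take a
branchy SSE/AVX2/AVX-512 path built on overlapping unaligned vector
load/store pairs (plus AVX-512 byte masks where available).

A minimal usage sketch (hypothetical caller; clib_memcpy_x86_64 is the
entry point added below):

  u8 dst[256], src[256];
  size_t n_rx = 57;
  clib_memcpy_x86_64 (dst, src, sizeof (dst)); /* constant n: unrolled */
  clib_memcpy_x86_64 (dst, src, n_rx);         /* runtime n: vector path */
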
Change-Id: I5a5055580479960ac53e3f989aa188faf57fb05d
Type: improvement
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vppinfra/memcpy_x86_64.h b/src/vppinfra/memcpy_x86_64.h
new file mode 100644
index 0000000..9662ab4
--- /dev/null
+++ b/src/vppinfra/memcpy_x86_64.h
@@ -0,0 +1,611 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Damjan Marion
+ */
+
+#ifndef included_clib_memcpy_x86_64_h
+#define included_clib_memcpy_x86_64_h
+#ifdef __x86_64__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/warnings.h>
+#include <stdio.h>
+
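+/* GCC can emit false-positive -Wstringop-overflow warnings for the
+ * deliberately overlapping fixed-size copies below, so that warning is
+ * suppressed for the rest of this header. */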
+/* clang-format off */
+WARN_OFF (stringop-overflow)
+/* clang-format on */
+
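+/* Fixed-size copy helpers. The u16u/u32u/u64u and u8xNu types are the
+ * unaligned variants, so each helper compiles down to a single unaligned
+ * load/store pair of the matching width. */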
+static_always_inline void
+clib_memcpy1 (void *d, void *s)
+{
+  *(u8 *) d = *(u8 *) s;
+}
+
+static_always_inline void
+clib_memcpy2 (void *d, void *s)
+{
+  *(u16u *) d = *(u16u *) s;
+}
+
+static_always_inline void
+clib_memcpy4 (void *d, void *s)
+{
+  *(u32u *) d = *(u32u *) s;
+}
+
+static_always_inline void
+clib_memcpy8 (void *d, void *s)
+{
+  *(u64u *) d = *(u64u *) s;
+}
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_memcpy16 (void *d, void *s)
+{
+  *(u8x16u *) d = *(u8x16u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC256
+static_always_inline void
+clib_memcpy32 (void *d, void *s)
+{
+  *(u8x32u *) d = *(u8x32u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC512
+static_always_inline void
+clib_memcpy64 (void *d, void *s)
+{
+  *(u8x64u *) d = *(u8x64u *) s;
+}
+#endif
+
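+/* Copy a compile-time-constant n <= 32 bytes. Sizes with no power-of-2
+ * case of their own are covered by two overlapping copies, e.g. n == 7 is
+ * a 4-byte copy at offset 0 plus a 4-byte copy at offset 3. */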
+static_always_inline void
+clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
+{
+  switch (n)
+    {
+    case 1:
+      clib_memcpy1 (dst, src);
+      break;
+    case 2:
+      clib_memcpy2 (dst, src);
+      break;
+    case 3:
+      clib_memcpy2 (dst, src);
+      clib_memcpy1 (dst + 2, src + 2);
+      break;
+    case 4:
+      clib_memcpy4 (dst, src);
+      break;
+    case 5:
+      clib_memcpy4 (dst, src);
+      clib_memcpy1 (dst + 4, src + 4);
+      break;
+    case 6:
+      clib_memcpy4 (dst, src);
+      clib_memcpy2 (dst + 4, src + 4);
+      break;
+    case 7:
+      clib_memcpy4 (dst, src);
+      clib_memcpy4 (dst + 3, src + 3);
+      break;
+    case 8:
+      clib_memcpy8 (dst, src);
+      break;
+    case 9:
+      clib_memcpy8 (dst, src);
+      clib_memcpy1 (dst + 8, src + 8);
+      break;
+    case 10:
+      clib_memcpy8 (dst, src);
+      clib_memcpy2 (dst + 8, src + 8);
+      break;
+    case 11:
+    case 12:
+      clib_memcpy8 (dst, src);
+      clib_memcpy4 (dst + n - 4, src + n - 4);
+      break;
+    case 13:
+    case 14:
+    case 15:
+      clib_memcpy8 (dst, src);
+      clib_memcpy8 (dst + n - 8, src + n - 8);
+      break;
+    case 16:
+      clib_memcpy16 (dst, src);
+      break;
+    case 17:
+      clib_memcpy16 (dst, src);
+      clib_memcpy1 (dst + 16, src + 16);
+      break;
+    case 18:
+      clib_memcpy16 (dst, src);
+      clib_memcpy2 (dst + 16, src + 16);
+      break;
+    case 20:
+      clib_memcpy16 (dst, src);
+      clib_memcpy4 (dst + 16, src + 16);
+      break;
+    case 24:
+      clib_memcpy16 (dst, src);
+      clib_memcpy8 (dst + 16, src + 16);
+      break;
+    default:
+      clib_memcpy16 (dst, src);
+      clib_memcpy16 (dst + n - 16, src + n - 16);
+      break;
+    }
+}
+
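+/* Copy a compile-time-constant n <= 64 bytes: 32-byte vectors when
+ * available, otherwise pairs of 16-byte copies. */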
+static_always_inline void
+clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
+{
+  if (n < 32)
+    {
+      clib_memcpy_const_le32 (dst, src, n);
+      return;
+    }
+
+#if defined(CLIB_HAVE_VEC256)
+  switch (n)
+    {
+    case 32:
+      clib_memcpy32 (dst, src);
+      break;
+    case 33:
+      clib_memcpy32 (dst, src);
+      clib_memcpy1 (dst + 32, src + 32);
+      break;
+    case 34:
+      clib_memcpy32 (dst, src);
+      clib_memcpy2 (dst + 32, src + 32);
+      break;
+    case 36:
+      clib_memcpy32 (dst, src);
+      clib_memcpy4 (dst + 32, src + 32);
+      break;
+    case 40:
+      clib_memcpy32 (dst, src);
+      clib_memcpy8 (dst + 32, src + 32);
+      break;
+    case 48:
+      clib_memcpy32 (dst, src);
+      clib_memcpy16 (dst + 32, src + 32);
+      break;
+    default:
+      clib_memcpy32 (dst, src);
+      clib_memcpy32 (dst + n - 32, src + n - 32);
+      break;
+    }
+#else
+  while (n > 31)
+    {
+      clib_memcpy16 (dst, src);
+      clib_memcpy16 (dst + 16, src + 16);
+      dst += 32;
+      src += 32;
+      n -= 32;
+    }
+  clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
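+/* Copy a compile-time-constant n of any size: bulk-copy with the widest
+ * available vector, then hand the remainder to the le64/le32 helpers. */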
+static_always_inline void
+clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
+{
+#if defined(CLIB_HAVE_VEC512)
+  while (n > 128)
+    {
+      clib_memcpy64 (dst, src);
+      dst += 64;
+      src += 64;
+      n -= 64;
+    }
+
+  if (n < 64)
+    {
+      clib_memcpy_const_le64 (dst, src, n);
+      return;
+    }
+
+  switch (n)
+    {
+    case 64:
+      clib_memcpy64 (dst, src);
+      break;
+    case 65:
+      clib_memcpy64 (dst, src);
+      clib_memcpy1 (dst + 64, src + 64);
+      break;
+    case 66:
+      clib_memcpy64 (dst, src);
+      clib_memcpy2 (dst + 64, src + 64);
+      break;
+    case 68:
+      clib_memcpy64 (dst, src);
+      clib_memcpy4 (dst + 64, src + 64);
+      break;
+    case 72:
+      clib_memcpy64 (dst, src);
+      clib_memcpy8 (dst + 64, src + 64);
+      break;
+    case 80:
+      clib_memcpy64 (dst, src);
+      clib_memcpy16 (dst + 64, src + 64);
+      break;
+    case 96:
+      clib_memcpy64 (dst, src);
+      clib_memcpy32 (dst + 64, src + 64);
+      break;
+    default:
+      clib_memcpy64 (dst, src);
+      clib_memcpy64 (dst + n - 64, src + n - 64);
+      break;
+    }
+#elif defined(CLIB_HAVE_VEC256)
+  while (n > 64)
+    {
+      clib_memcpy32 (dst, src);
+      dst += 32;
+      src += 32;
+      n -= 32;
+    }
+  clib_memcpy_const_le64 (dst, src, n);
+#else
+  while (n > 32)
+    {
+      clib_memcpy16 (dst, src);
+      dst += 16;
+      src += 16;
+      n -= 16;
+    }
+  clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
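+/* Main entry point. Constant sizes are fully unrolled via the helpers
+ * above; runtime sizes take one of the branchy vector paths below. The
+ * restrict qualifiers mean src and dst must not overlap. */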
+static_always_inline void *
+clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
+{
+  u8 *d = (u8 *) dst, *s = (u8 *) src;
+
+  if (n == 0)
+    return dst;
+
+  if (COMPILE_TIME_CONST (n))
+    {
+      if (n)
+	clib_memcpy_x86_64_const (d, s, n);
+      return dst;
+    }
+
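+  /* Runtime n <= 32: a single masked 32-byte load/store pair when AVX-512
+   * byte masks are available, otherwise two overlapping power-of-2 copies
+   * selected by size. */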
+  if (n <= 32)
+    {
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+      u32 mask = pow2_mask (n);
+      u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
+#else
+      if (PREDICT_TRUE (n >= 16))
+	{
+	  clib_memcpy16 (d, s);
+	  clib_memcpy16 (d + n - 16, s + n - 16);
+	}
+      else if (PREDICT_TRUE (n >= 8))
+	{
+	  clib_memcpy8 (d, s);
+	  clib_memcpy8 (d + n - 8, s + n - 8);
+	}
+      else if (PREDICT_TRUE (n >= 4))
+	{
+	  clib_memcpy4 (d, s);
+	  clib_memcpy4 (d + n - 4, s + n - 4);
+	}
+      else if (PREDICT_TRUE (n > 1))
+	{
+	  clib_memcpy2 (d, s);
+	  clib_memcpy2 (d + n - 2, s + n - 2);
+	}
+      else
+	clib_memcpy1 (d, s);
+#endif
+    }
+#ifdef CLIB_HAVE_VEC512
+  else
+    {
+      u8x64 v0, v1, v2, v3;
+      u64 final_off, nr, off = 64;
+
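+      /* Strategy: n <= 192 is covered by a few overlapping 64-byte
+       * vectors, n <= 576 goes straight to the remainder ladder at
+       * 'last', and larger copies first align stores to a 64-byte
+       * destination boundary and run the 512-byte unrolled loop. */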
+      if (n <= 64)
+	{
+	  n -= 32;
+	  u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+	  u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
+	  return dst;
+	}
+
+      u8x64_store_unaligned (u8x64_load_unaligned (s), d);
+
+      if (n <= 128)
+	goto done2;
+
+      if (n <= 192)
+	goto one;
+
+      if (n <= 512 + 64)
+	{
+	  nr = round_pow2 (n - 128, 64);
+	  goto last;
+	}
+
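+      /* Shrink off so that d + off is 64-byte aligned; the head vector
+       * above already covered bytes [0, 64), so the overlap is safe. */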
+      off -= ((u64) d) & 0x3f;
+      nr = round_pow2 (n - off - 64, 64);
+      final_off = (nr & ~(u64) 0x1ff) + off;
+
+    more:
+      v0 = u8x64_load_unaligned (s + off + 0x000);
+      v1 = u8x64_load_unaligned (s + off + 0x040);
+      v2 = u8x64_load_unaligned (s + off + 0x080);
+      v3 = u8x64_load_unaligned (s + off + 0x0c0);
+      u8x64_store_unaligned (v0, d + off + 0x000);
+      u8x64_store_unaligned (v1, d + off + 0x040);
+      u8x64_store_unaligned (v2, d + off + 0x080);
+      u8x64_store_unaligned (v3, d + off + 0x0c0);
+      v0 = u8x64_load_unaligned (s + off + 0x100);
+      v1 = u8x64_load_unaligned (s + off + 0x140);
+      v2 = u8x64_load_unaligned (s + off + 0x180);
+      v3 = u8x64_load_unaligned (s + off + 0x1c0);
+      u8x64_store_unaligned (v0, d + off + 0x100);
+      u8x64_store_unaligned (v1, d + off + 0x140);
+      u8x64_store_unaligned (v2, d + off + 0x180);
+      u8x64_store_unaligned (v3, d + off + 0x1c0);
+      off += 512;
+      if (off != final_off)
+	goto more;
+
+      if ((nr & 0x1ff) == 0)
+	goto done2;
+
+    last:
+      if (PREDICT_TRUE (nr & 256))
+	{
+	  v0 = u8x64_load_unaligned (s + off + 0x000);
+	  v1 = u8x64_load_unaligned (s + off + 0x040);
+	  v2 = u8x64_load_unaligned (s + off + 0x080);
+	  v3 = u8x64_load_unaligned (s + off + 0x0c0);
+	  u8x64_store_unaligned (v0, d + off + 0x000);
+	  u8x64_store_unaligned (v1, d + off + 0x040);
+	  u8x64_store_unaligned (v2, d + off + 0x080);
+	  u8x64_store_unaligned (v3, d + off + 0x0c0);
+	  off += 256;
+	}
+      if (PREDICT_TRUE (nr & 128))
+	{
+	  v0 = u8x64_load_unaligned (s + off + 0x000);
+	  v1 = u8x64_load_unaligned (s + off + 0x040);
+	  u8x64_store_unaligned (v0, d + off + 0x000);
+	  u8x64_store_unaligned (v1, d + off + 0x040);
+	  off += 128;
+	}
+      if (PREDICT_TRUE (nr & 64))
+	{
+	one:
+	  u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
+	}
+    done2:
+      /* Final, possibly overlapping, vector store covers the tail. */
+      u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
+    }
+  return dst;
+#elif defined(CLIB_HAVE_VEC256)
+  else
+    {
+      u8x32 v0, v1, v2, v3;
+      u64 final_off, nr, off = 32;
+
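+      /* Same structure as the 64-byte variant: overlapping 32-byte
+       * vectors for small n, a remainder ladder up to 288 bytes, then a
+       * destination-aligned 256-byte unrolled loop. */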
+      u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+
+      if (n <= 64)
+	goto done2;
+
+      if (n <= 96)
+	goto one;
+
+      if (n <= 256 + 32)
+	{
+	  nr = round_pow2 (n - 64, 32);
+	  goto last;
+	}
+
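+      /* Shrink off so that d + off is 32-byte aligned; the head vector
+       * above already covered bytes [0, 32). */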
+      off -= ((u64) d) & 0x1f;
+      nr = round_pow2 (n - off - 32, 32);
+      final_off = (nr & ~(u64) 0xff) + off;
+
+    more:
+      v0 = u8x32_load_unaligned (s + off + 0x00);
+      v1 = u8x32_load_unaligned (s + off + 0x20);
+      v2 = u8x32_load_unaligned (s + off + 0x40);
+      v3 = u8x32_load_unaligned (s + off + 0x60);
+      u8x32_store_unaligned (v0, d + off + 0x00);
+      u8x32_store_unaligned (v1, d + off + 0x20);
+      u8x32_store_unaligned (v2, d + off + 0x40);
+      u8x32_store_unaligned (v3, d + off + 0x60);
+      v0 = u8x32_load_unaligned (s + off + 0x80);
+      v1 = u8x32_load_unaligned (s + off + 0xa0);
+      v2 = u8x32_load_unaligned (s + off + 0xc0);
+      v3 = u8x32_load_unaligned (s + off + 0xe0);
+      u8x32_store_unaligned (v0, d + off + 0x80);
+      u8x32_store_unaligned (v1, d + off + 0xa0);
+      u8x32_store_unaligned (v2, d + off + 0xc0);
+      u8x32_store_unaligned (v3, d + off + 0xe0);
+      off += 256;
+      if (off != final_off)
+	goto more;
+
+      if ((nr & 0xff) == 0)
+	goto done2;
+
+    last:
+      if (PREDICT_TRUE (nr & 128))
+	{
+	  v0 = u8x32_load_unaligned (s + off + 0x00);
+	  v1 = u8x32_load_unaligned (s + off + 0x20);
+	  v2 = u8x32_load_unaligned (s + off + 0x40);
+	  v3 = u8x32_load_unaligned (s + off + 0x60);
+	  u8x32_store_unaligned (v0, d + off + 0x00);
+	  u8x32_store_unaligned (v1, d + off + 0x20);
+	  u8x32_store_unaligned (v2, d + off + 0x40);
+	  u8x32_store_unaligned (v3, d + off + 0x60);
+	  off += 128;
+	}
+      if (PREDICT_TRUE (nr & 64))
+	{
+	  v0 = u8x32_load_unaligned (s + off + 0x00);
+	  v1 = u8x32_load_unaligned (s + off + 0x20);
+	  u8x32_store_unaligned (v0, d + off + 0x00);
+	  u8x32_store_unaligned (v1, d + off + 0x20);
+	  off += 64;
+	}
+      if (PREDICT_TRUE (nr & 32))
+	{
+	one:
+	  u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
+	}
+    done2:
+      u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
+    }
+  return dst;
+#elif defined(CLIB_HAVE_VEC128)
+  else
+    {
+      u8x16 v0, v1, v2, v3;
+      u64 final_off, nr, off = 32;
+
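+      /* 16-byte vector variant: two head vectors cover n <= 48, a
+       * remainder ladder handles up to 288 bytes, and larger copies use a
+       * destination-aligned 256-byte unrolled loop. */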
+      /* Disabled experiment ('0 &&'): punt very large copies to the
+       * compiler's builtin memcpy. */
+      if (0 && n > 389)
+	{
+	  __builtin_memcpy (d, s, n);
+	  return dst;
+	}
+
+      u8x16_store_unaligned (u8x16_load_unaligned (s), d);
+      u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);
+
+      if (n <= 48)
+	goto done2;
+
+      if (n <= 64)
+	goto one;
+
+      if (n <= 256 + 32)
+	{
+	  nr = round_pow2 (n - 48, 16);
+	  goto last;
+	}
+
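+      /* Shrink off so that d + off is 16-byte aligned; the two head
+       * vectors above already covered bytes [0, 32). */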
+      off -= ((u64) d) & 0x0f;
+      nr = round_pow2 (n - off - 16, 16);
+      final_off = (nr & ~(u64) 0xff) + off;
+
+    more:
+      v0 = u8x16_load_unaligned (s + off + 0x00);
+      v1 = u8x16_load_unaligned (s + off + 0x10);
+      v2 = u8x16_load_unaligned (s + off + 0x20);
+      v3 = u8x16_load_unaligned (s + off + 0x30);
+      u8x16_store_unaligned (v0, d + off + 0x00);
+      u8x16_store_unaligned (v1, d + off + 0x10);
+      u8x16_store_unaligned (v2, d + off + 0x20);
+      u8x16_store_unaligned (v3, d + off + 0x30);
+      v0 = u8x16_load_unaligned (s + off + 0x40);
+      v1 = u8x16_load_unaligned (s + off + 0x50);
+      v2 = u8x16_load_unaligned (s + off + 0x60);
+      v3 = u8x16_load_unaligned (s + off + 0x70);
+      u8x16_store_unaligned (v0, d + off + 0x40);
+      u8x16_store_unaligned (v1, d + off + 0x50);
+      u8x16_store_unaligned (v2, d + off + 0x60);
+      u8x16_store_unaligned (v3, d + off + 0x70);
+      v0 = u8x16_load_unaligned (s + off + 0x80);
+      v1 = u8x16_load_unaligned (s + off + 0x90);
+      v2 = u8x16_load_unaligned (s + off + 0xa0);
+      v3 = u8x16_load_unaligned (s + off + 0xb0);
+      u8x16_store_unaligned (v0, d + off + 0x80);
+      u8x16_store_unaligned (v1, d + off + 0x90);
+      u8x16_store_unaligned (v2, d + off + 0xa0);
+      u8x16_store_unaligned (v3, d + off + 0xb0);
+      v0 = u8x16_load_unaligned (s + off + 0xc0);
+      v1 = u8x16_load_unaligned (s + off + 0xd0);
+      v2 = u8x16_load_unaligned (s + off + 0xe0);
+      v3 = u8x16_load_unaligned (s + off + 0xf0);
+      u8x16_store_unaligned (v0, d + off + 0xc0);
+      u8x16_store_unaligned (v1, d + off + 0xd0);
+      u8x16_store_unaligned (v2, d + off + 0xe0);
+      u8x16_store_unaligned (v3, d + off + 0xf0);
+      off += 256;
+      if (off != final_off)
+	goto more;
+
+      if ((nr & 0xff) == 0)
+	goto done2;
+
+    last:
+      if (PREDICT_TRUE (nr & 128))
+	{
+	  v0 = u8x16_load_unaligned (s + off + 0x00);
+	  v1 = u8x16_load_unaligned (s + off + 0x10);
+	  v2 = u8x16_load_unaligned (s + off + 0x20);
+	  v3 = u8x16_load_unaligned (s + off + 0x30);
+	  u8x16_store_unaligned (v0, d + off + 0x00);
+	  u8x16_store_unaligned (v1, d + off + 0x10);
+	  u8x16_store_unaligned (v2, d + off + 0x20);
+	  u8x16_store_unaligned (v3, d + off + 0x30);
+	  v0 = u8x16_load_unaligned (s + off + 0x40);
+	  v1 = u8x16_load_unaligned (s + off + 0x50);
+	  v2 = u8x16_load_unaligned (s + off + 0x60);
+	  v3 = u8x16_load_unaligned (s + off + 0x70);
+	  u8x16_store_unaligned (v0, d + off + 0x40);
+	  u8x16_store_unaligned (v1, d + off + 0x50);
+	  u8x16_store_unaligned (v2, d + off + 0x60);
+	  u8x16_store_unaligned (v3, d + off + 0x70);
+	  off += 128;
+	}
+      if (PREDICT_TRUE (nr & 64))
+	{
+	  v0 = u8x16_load_unaligned (s + off + 0x00);
+	  v1 = u8x16_load_unaligned (s + off + 0x10);
+	  v2 = u8x16_load_unaligned (s + off + 0x20);
+	  v3 = u8x16_load_unaligned (s + off + 0x30);
+	  u8x16_store_unaligned (v0, d + off + 0x00);
+	  u8x16_store_unaligned (v1, d + off + 0x10);
+	  u8x16_store_unaligned (v2, d + off + 0x20);
+	  u8x16_store_unaligned (v3, d + off + 0x30);
+	  off += 64;
+	}
+      if (PREDICT_TRUE (nr & 32))
+	{
+	  v0 = u8x16_load_unaligned (s + off + 0x00);
+	  v1 = u8x16_load_unaligned (s + off + 0x10);
+	  u8x16_store_unaligned (v0, d + off + 0x00);
+	  u8x16_store_unaligned (v1, d + off + 0x10);
+	  off += 32;
+	}
+      if (PREDICT_TRUE (nr & 16))
+	{
+	one:
+	  u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
+	}
+    done2:
+      u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
+    }
+  return dst;
+#else
+#error "SSE/AVX2/AVX512 must be enabled"
+#endif
+
+  return dst;
+}
+
+/* clang-format off */
+WARN_ON (stringop-overflow)
+/* clang-format on */
+
+#endif /* __x86_64__ */
+#endif /* included_clib_memcpy_x86_64_h */