blob: 9662ab4e7ef9d096ac175c5b8fcb33888bb59ad1 [file] [log] [blame]
/* SPDX-License-Identifier: Apache-2.0
* Copyright(c) 2021 Damjan Marion
*/
#ifndef included_clib_memcpy_x86_64_h
#define included_clib_memcpy_x86_64_h
#ifdef __x86_64__
#include <vppinfra/clib.h>
#include <vppinfra/warnings.h>
#include <stdio.h>
/* clang-format off */
WARN_OFF (stringop-overflow)
/* clang-format on */
static_always_inline void
clib_memcpy1 (void *d, void *s)
{
*(u8 *) d = *(u8 *) s;
}
static_always_inline void
clib_memcpy2 (void *d, void *s)
{
*(u16u *) d = *(u16u *) s;
}
static_always_inline void
clib_memcpy4 (void *d, void *s)
{
*(u32u *) d = *(u32u *) s;
}
static_always_inline void
clib_memcpy8 (void *d, void *s)
{
*(u64u *) d = *(u64u *) s;
}
#ifdef CLIB_HAVE_VEC128
static_always_inline void
clib_memcpy16 (void *d, void *s)
{
*(u8x16u *) d = *(u8x16u *) s;
}
#endif
#ifdef CLIB_HAVE_VEC256
static_always_inline void
clib_memcpy32 (void *d, void *s)
{
*(u8x32u *) d = *(u8x32u *) s;
}
#endif
#ifdef CLIB_HAVE_VEC512
static_always_inline void
clib_memcpy64 (void *d, void *s)
{
*(u8x64u *) d = *(u8x64u *) s;
}
#endif
/*
 * Copy n bytes from src to dst, for 0 <= n <= 32.
 *
 * Each length is served by at most two fixed-size copies. Where no exact
 * pair of sizes exists, the second copy is anchored at the end of the
 * region (dst + n - size) and overlaps the first one; rewriting a few
 * bytes with the same data is harmless and avoids a third copy.
 *
 * dst - destination buffer, at least n bytes
 * src - source buffer, at least n bytes
 * n   - number of bytes to copy; values above 32 are not supported (the
 *       default case would copy only the first and the last 16 bytes)
 */
static_always_inline void
clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
{
  switch (n)
    {
    case 0:
      /* Nothing to copy. Without this case the default branch below would
	 copy 16 bytes at dst - 16 and another 16 bytes at dst, touching
	 memory outside of the (empty) region. */
      break;
    case 1:
      clib_memcpy1 (dst, src);
      break;
    case 2:
      clib_memcpy2 (dst, src);
      break;
    case 3:
      clib_memcpy2 (dst, src);
      clib_memcpy1 (dst + 2, src + 2);
      break;
    case 4:
      clib_memcpy4 (dst, src);
      break;
    case 5:
      clib_memcpy4 (dst, src);
      clib_memcpy1 (dst + 4, src + 4);
      break;
    case 6:
      clib_memcpy4 (dst, src);
      clib_memcpy2 (dst + 4, src + 4);
      break;
    case 7:
      /* two 4-byte copies overlapping on byte 3 */
      clib_memcpy4 (dst, src);
      clib_memcpy4 (dst + 3, src + 3);
      break;
    case 8:
      clib_memcpy8 (dst, src);
      break;
    case 9:
      clib_memcpy8 (dst, src);
      clib_memcpy1 (dst + 8, src + 8);
      break;
    case 10:
      clib_memcpy8 (dst, src);
      clib_memcpy2 (dst + 8, src + 8);
      break;
    case 11:
    case 12:
      /* 8 bytes from the start + the last 4 bytes */
      clib_memcpy8 (dst, src);
      clib_memcpy4 (dst + n - 4, src + n - 4);
      break;
    case 13:
    case 14:
    case 15:
      /* 8 bytes from the start + the last 8 bytes */
      clib_memcpy8 (dst, src);
      clib_memcpy8 (dst + n - 8, src + n - 8);
      break;
    case 16:
      clib_memcpy16 (dst, src);
      break;
    case 17:
      clib_memcpy16 (dst, src);
      clib_memcpy1 (dst + 16, src + 16);
      break;
    case 18:
      clib_memcpy16 (dst, src);
      clib_memcpy2 (dst + 16, src + 16);
      break;
    case 20:
      clib_memcpy16 (dst, src);
      clib_memcpy4 (dst + 16, src + 16);
      break;
    case 24:
      clib_memcpy16 (dst, src);
      clib_memcpy8 (dst + 16, src + 16);
      break;
    default:
      /* remaining lengths in 19..32: first 16 bytes + last 16 bytes */
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + n - 16, src + n - 16);
      break;
    }
}
/*
 * Copy n bytes from src to dst, for n <= 64.
 *
 * Lengths below 32 are delegated to clib_memcpy_const_le32 (). For
 * 32 <= n <= 64 the copy is done either with 256-bit vectors (when
 * CLIB_HAVE_VEC256 is defined) or with a sequence of 128-bit copies.
 *
 * dst - destination buffer, at least n bytes
 * src - source buffer, at least n bytes
 * n   - number of bytes to copy
 */
static_always_inline void
clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
{
  if (n < 32)
    {
      clib_memcpy_const_le32 (dst, src, n);
      return;
    }

#if defined(CLIB_HAVE_VEC256)
  switch (n)
    {
    case 32:
      clib_memcpy32 (dst, src);
      break;
    case 33:
      clib_memcpy32 (dst, src);
      clib_memcpy1 (dst + 32, src + 32);
      break;
    case 34:
      clib_memcpy32 (dst, src);
      clib_memcpy2 (dst + 32, src + 32);
      break;
    case 36:
      clib_memcpy32 (dst, src);
      clib_memcpy4 (dst + 32, src + 32);
      break;
    case 40:
      clib_memcpy32 (dst, src);
      clib_memcpy8 (dst + 32, src + 32);
      break;
    case 48:
      clib_memcpy32 (dst, src);
      clib_memcpy16 (dst + 32, src + 32);
      break;
    default:
      /* first 32 bytes + last 32 bytes, overlapping in the middle */
      clib_memcpy32 (dst, src);
      clib_memcpy32 (dst + n - 32, src + n - 32);
      break;
    }
#else
  /* Peel off 16 bytes at a time while at least 32 bytes remain. Stepping by
     16 (not 32) guarantees that 16..31 bytes are left for the final call,
     so the remainder can never become 0. A zero-length tail must be
     avoided: for lengths it does not list explicitly, the le32 helper
     copies 16 bytes at dst and at dst + n - 16, which for n == 0 would
     write past the end of the destination. */
  while (n > 31)
    {
      clib_memcpy16 (dst, src);
      dst += 16;
      src += 16;
      n -= 16;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}
/*
 * Copy n bytes from src to dst using a fixed sequence of vector-sized
 * copies. Called by clib_memcpy_x86_64 () when n is a compile-time
 * constant.
 *
 * Full vectors of the widest available size are copied while more than two
 * vectors' worth of data remains; the rest is finished either by a
 * size-specific pair of copies or by the narrower *_le64 / *_le32 helper.
 */
static_always_inline void
clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
{
#if defined(CLIB_HAVE_VEC512)
  u8 *dst_tail, *src_tail;

  /* whole 64-byte vectors until at most 128 bytes are left */
  for (; n > 128; n -= 64, dst += 64, src += 64)
    clib_memcpy64 (dst, src);

  if (n < 64)
    {
      clib_memcpy_const_le64 (dst, src, n);
      return;
    }

  /* 64 <= n <= 128: every length starts with one full 64-byte copy ... */
  clib_memcpy64 (dst, src);
  dst_tail = dst + 64;
  src_tail = src + 64;

  /* ... followed by whatever is needed to cover the remaining n - 64 */
  switch (n)
    {
    case 64:
      break;
    case 65:
      clib_memcpy1 (dst_tail, src_tail);
      break;
    case 66:
      clib_memcpy2 (dst_tail, src_tail);
      break;
    case 68:
      clib_memcpy4 (dst_tail, src_tail);
      break;
    case 72:
      clib_memcpy8 (dst_tail, src_tail);
      break;
    case 80:
      clib_memcpy16 (dst_tail, src_tail);
      break;
    case 96:
      clib_memcpy32 (dst_tail, src_tail);
      break;
    default:
      /* last 64 bytes, overlapping the first copy */
      clib_memcpy64 (dst + n - 64, src + n - 64);
      break;
    }
#elif defined(CLIB_HAVE_VEC256)
  /* whole 32-byte vectors until at most 64 bytes are left */
  for (; n > 64; n -= 32, dst += 32, src += 32)
    clib_memcpy32 (dst, src);

  clib_memcpy_const_le64 (dst, src, n);
#else
  /* whole 16-byte vectors until at most 32 bytes are left */
  for (; n > 32; n -= 16, dst += 16, src += 16)
    clib_memcpy16 (dst, src);

  clib_memcpy_const_le32 (dst, src, n);
#endif
}
/*
 * Copy n bytes from src to dst and return dst.
 *
 * Both pointers are declared restrict, so the two regions must not overlap.
 *
 * Strategy:
 *  - n == 0: nothing is touched.
 *  - n is a compile-time constant: delegate to clib_memcpy_x86_64_const ().
 *  - n <= 32: one masked 256-bit load/store when
 *    CLIB_HAVE_VEC256_MASK_LOAD_STORE is defined, otherwise two copies of
 *    the same size, one anchored at the start and one at the end of the
 *    region (they overlap when n is not exactly twice the copy size).
 *  - n > 32: copy the first vector(s), then blocks of vectors starting at
 *    offset 'off', and finally the last vector anchored at dst + n - size
 *    (label done2), which overlaps previously written bytes as needed.
 *    The vector width depends on CLIB_HAVE_VEC512 / VEC256 / VEC128.
 *
 * In the n > 32 branches:
 *   off       - offset of the next block to copy
 *   nr        - number of bytes to copy in whole vectors between the head
 *               and the final tail vector.
 *               NOTE(review): round_pow2 () is defined elsewhere and is
 *               presumably "round up to a multiple of the 2nd argument" -
 *               confirm.
 *   final_off - offset at which the unrolled 'more' loop stops
 */
static_always_inline void *
clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
{
u8 *d = (u8 *) dst, *s = (u8 *) src;
if (n == 0)
return dst;
/* constant size: use the fixed copy sequence selected by
   clib_memcpy_x86_64_const () */
if (COMPILE_TIME_CONST (n))
{
/* n is already known to be non-zero here because of the check above */
if (n)
clib_memcpy_x86_64_const (d, s, n);
return dst;
}
if (n <= 32)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
/* NOTE(review): pow2_mask (n) presumably yields a mask with the n low bits
   set, one bit per byte lane, so only n bytes are loaded and stored -
   confirm against its definition */
u32 mask = pow2_mask (n);
u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
#else
/* two copies of the same size: the first n-independent one from the start,
   the second one ending exactly at byte n */
if (PREDICT_TRUE (n >= 16))
{
clib_memcpy16 (d, s);
clib_memcpy16 (d + n - 16, s + n - 16);
}
else if (PREDICT_TRUE (n >= 8))
{
clib_memcpy8 (d, s);
clib_memcpy8 (d + n - 8, s + n - 8);
}
else if (PREDICT_TRUE (n >= 4))
{
clib_memcpy4 (d, s);
clib_memcpy4 (d + n - 4, s + n - 4);
}
else if (PREDICT_TRUE (n > 1))
{
clib_memcpy2 (d, s);
clib_memcpy2 (d + n - 2, s + n - 2);
}
else
/* n == 1 */
clib_memcpy1 (d, s);
#endif
}
#ifdef CLIB_HAVE_VEC512
/* n > 32, 512-bit vectors */
else
{
u8x64 v0, v1, v2, v3;
u64 final_off, nr, off = 64;
/* 33..64 bytes: first 32 bytes + last 32 bytes */
if (n <= 64)
{
n -= 32;
u8x32_store_unaligned (u8x32_load_unaligned (s), d);
u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
return dst;
}
/* head: first 64 bytes */
u8x64_store_unaligned (u8x64_load_unaligned (s), d);
/* 65..128 bytes: head + last 64 bytes */
if (n <= 128)
goto done2;
/* 129..192 bytes: head + 64 bytes at offset 64 + last 64 bytes */
if (n <= 192)
goto one;
/* up to 576 bytes: skip the 512-byte loop and the destination alignment,
   go directly to the 256/128/64-byte steps */
if (n <= 512 + 64)
{
nr = round_pow2 (n - 128, 64);
goto last;
}
/* move 'off' back so that d + off is a multiple of 64; the bytes before it
   were already written by the head copy */
off -= ((u64) d) & 0x3f;
nr = round_pow2 (n - off - 64, 64);
/* the loop below handles the multiple-of-512 part of nr */
final_off = (nr & ~(u64) 0x1ff) + off;
/* 512 bytes per iteration, as two groups of four 64-byte loads followed by
   four 64-byte stores */
more:
v0 = u8x64_load_unaligned (s + off + 0x000);
v1 = u8x64_load_unaligned (s + off + 0x040);
v2 = u8x64_load_unaligned (s + off + 0x080);
v3 = u8x64_load_unaligned (s + off + 0x0c0);
u8x64_store_unaligned (v0, d + off + 0x000);
u8x64_store_unaligned (v1, d + off + 0x040);
u8x64_store_unaligned (v2, d + off + 0x080);
u8x64_store_unaligned (v3, d + off + 0x0c0);
v0 = u8x64_load_unaligned (s + off + 0x100);
v1 = u8x64_load_unaligned (s + off + 0x140);
v2 = u8x64_load_unaligned (s + off + 0x180);
v3 = u8x64_load_unaligned (s + off + 0x1c0);
u8x64_store_unaligned (v0, d + off + 0x100);
u8x64_store_unaligned (v1, d + off + 0x140);
u8x64_store_unaligned (v2, d + off + 0x180);
u8x64_store_unaligned (v3, d + off + 0x1c0);
off += 512;
if (off != final_off)
goto more;
/* nothing left below 512 bytes except the tail vector */
if ((nr & 0x1ff) == 0)
goto done2;
/* remaining part of nr, one step per set bit: 256, 128, then 64 bytes */
last:
if (PREDICT_TRUE (nr & 256))
{
v0 = u8x64_load_unaligned (s + off + 0x000);
v1 = u8x64_load_unaligned (s + off + 0x040);
v2 = u8x64_load_unaligned (s + off + 0x080);
v3 = u8x64_load_unaligned (s + off + 0x0c0);
u8x64_store_unaligned (v0, d + off + 0x000);
u8x64_store_unaligned (v1, d + off + 0x040);
u8x64_store_unaligned (v2, d + off + 0x080);
u8x64_store_unaligned (v3, d + off + 0x0c0);
off += 256;
}
if (PREDICT_TRUE (nr & 128))
{
v0 = u8x64_load_unaligned (s + off + 0x000);
v1 = u8x64_load_unaligned (s + off + 0x040);
u8x64_store_unaligned (v0, d + off + 0x000);
u8x64_store_unaligned (v1, d + off + 0x040);
off += 128;
}
if (PREDICT_TRUE (nr & 64))
{
/* also entered directly for 129..192 bytes, with off == 64 */
one:
u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
}
/* tail: last 64 bytes of the region, may overlap bytes already written */
done2:
u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
}
return dst;
#elif defined(CLIB_HAVE_VEC256)
/* n > 32, 256-bit vectors */
else
{
u8x32 v0, v1, v2, v3;
u64 final_off, nr, off = 32;
/* head: first 32 bytes */
u8x32_store_unaligned (u8x32_load_unaligned (s), d);
/* 33..64 bytes: head + last 32 bytes */
if (n <= 64)
goto done2;
/* 65..96 bytes: head + 32 bytes at offset 32 + last 32 bytes */
if (n <= 96)
goto one;
/* up to 288 bytes: skip the 256-byte loop and the destination alignment,
   go directly to the 128/64/32-byte steps */
if (n <= 256 + 32)
{
nr = round_pow2 (n - 64, 32);
goto last;
}
/* move 'off' back so that d + off is a multiple of 32; the bytes before it
   were already written by the head copy */
off -= ((u64) d) & 0x1f;
nr = round_pow2 (n - off - 32, 32);
/* the loop below handles the multiple-of-256 part of nr */
final_off = (nr & ~(u64) 0xff) + off;
/* 256 bytes per iteration, as two groups of four 32-byte loads followed by
   four 32-byte stores */
more:
v0 = u8x32_load_unaligned (s + off + 0x00);
v1 = u8x32_load_unaligned (s + off + 0x20);
v2 = u8x32_load_unaligned (s + off + 0x40);
v3 = u8x32_load_unaligned (s + off + 0x60);
u8x32_store_unaligned (v0, d + off + 0x00);
u8x32_store_unaligned (v1, d + off + 0x20);
u8x32_store_unaligned (v2, d + off + 0x40);
u8x32_store_unaligned (v3, d + off + 0x60);
v0 = u8x32_load_unaligned (s + off + 0x80);
v1 = u8x32_load_unaligned (s + off + 0xa0);
v2 = u8x32_load_unaligned (s + off + 0xc0);
v3 = u8x32_load_unaligned (s + off + 0xe0);
u8x32_store_unaligned (v0, d + off + 0x80);
u8x32_store_unaligned (v1, d + off + 0xa0);
u8x32_store_unaligned (v2, d + off + 0xc0);
u8x32_store_unaligned (v3, d + off + 0xe0);
off += 256;
if (off != final_off)
goto more;
/* nothing left below 256 bytes except the tail vector */
if ((nr & 0xff) == 0)
goto done2;
/* remaining part of nr, one step per set bit: 128, 64, then 32 bytes */
last:
if (PREDICT_TRUE (nr & 128))
{
v0 = u8x32_load_unaligned (s + off + 0x00);
v1 = u8x32_load_unaligned (s + off + 0x20);
v2 = u8x32_load_unaligned (s + off + 0x40);
v3 = u8x32_load_unaligned (s + off + 0x60);
u8x32_store_unaligned (v0, d + off + 0x00);
u8x32_store_unaligned (v1, d + off + 0x20);
u8x32_store_unaligned (v2, d + off + 0x40);
u8x32_store_unaligned (v3, d + off + 0x60);
off += 128;
}
if (PREDICT_TRUE (nr & 64))
{
v0 = u8x32_load_unaligned (s + off + 0x00);
v1 = u8x32_load_unaligned (s + off + 0x20);
u8x32_store_unaligned (v0, d + off + 0x00);
u8x32_store_unaligned (v1, d + off + 0x20);
off += 64;
}
if (PREDICT_TRUE (nr & 32))
{
/* also entered directly for 65..96 bytes, with off == 32 */
one:
u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
}
/* tail: last 32 bytes of the region, may overlap bytes already written */
done2:
u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
}
return dst;
#elif defined(CLIB_HAVE_VEC128)
/* n > 32, 128-bit vectors */
else
{
u8x16 v0, v1, v2, v3;
u64 final_off, nr, off = 32;
/* disabled by the constant 0: fallback to the compiler's builtin memcpy
   for large sizes */
if (0 && n > 389)
{
__builtin_memcpy (d, s, n);
return dst;
}
/* head: first 32 bytes as two 16-byte copies */
u8x16_store_unaligned (u8x16_load_unaligned (s), d);
u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);
/* 33..48 bytes: head + last 16 bytes */
if (n <= 48)
goto done2;
/* 49..64 bytes: head + 16 bytes at offset 32 + last 16 bytes */
if (n <= 64)
goto one;
/* up to 288 bytes: skip the 256-byte loop and the destination alignment,
   go directly to the 128/64/32/16-byte steps */
if (n <= 256 + 32)
{
nr = round_pow2 (n - 48, 16);
goto last;
}
/* move 'off' back so that d + off is a multiple of 16; the bytes before it
   were already written by the head copy */
off -= ((u64) d) & 0x0f;
nr = round_pow2 (n - off - 16, 16);
/* the loop below handles the multiple-of-256 part of nr */
final_off = (nr & ~(u64) 0xff) + off;
/* 256 bytes per iteration, as four groups of four 16-byte loads followed by
   four 16-byte stores */
more:
v0 = u8x16_load_unaligned (s + off + 0x00);
v1 = u8x16_load_unaligned (s + off + 0x10);
v2 = u8x16_load_unaligned (s + off + 0x20);
v3 = u8x16_load_unaligned (s + off + 0x30);
u8x16_store_unaligned (v0, d + off + 0x00);
u8x16_store_unaligned (v1, d + off + 0x10);
u8x16_store_unaligned (v2, d + off + 0x20);
u8x16_store_unaligned (v3, d + off + 0x30);
v0 = u8x16_load_unaligned (s + off + 0x40);
v1 = u8x16_load_unaligned (s + off + 0x50);
v2 = u8x16_load_unaligned (s + off + 0x60);
v3 = u8x16_load_unaligned (s + off + 0x70);
u8x16_store_unaligned (v0, d + off + 0x40);
u8x16_store_unaligned (v1, d + off + 0x50);
u8x16_store_unaligned (v2, d + off + 0x60);
u8x16_store_unaligned (v3, d + off + 0x70);
v0 = u8x16_load_unaligned (s + off + 0x80);
v1 = u8x16_load_unaligned (s + off + 0x90);
v2 = u8x16_load_unaligned (s + off + 0xa0);
v3 = u8x16_load_unaligned (s + off + 0xb0);
u8x16_store_unaligned (v0, d + off + 0x80);
u8x16_store_unaligned (v1, d + off + 0x90);
u8x16_store_unaligned (v2, d + off + 0xa0);
u8x16_store_unaligned (v3, d + off + 0xb0);
v0 = u8x16_load_unaligned (s + off + 0xc0);
v1 = u8x16_load_unaligned (s + off + 0xd0);
v2 = u8x16_load_unaligned (s + off + 0xe0);
v3 = u8x16_load_unaligned (s + off + 0xf0);
u8x16_store_unaligned (v0, d + off + 0xc0);
u8x16_store_unaligned (v1, d + off + 0xd0);
u8x16_store_unaligned (v2, d + off + 0xe0);
u8x16_store_unaligned (v3, d + off + 0xf0);
off += 256;
if (off != final_off)
goto more;
/* nothing left below 256 bytes except the tail vector */
if ((nr & 0xff) == 0)
goto done2;
/* remaining part of nr, one step per set bit: 128, 64, 32, then 16 bytes */
last:
if (PREDICT_TRUE (nr & 128))
{
v0 = u8x16_load_unaligned (s + off + 0x00);
v1 = u8x16_load_unaligned (s + off + 0x10);
v2 = u8x16_load_unaligned (s + off + 0x20);
v3 = u8x16_load_unaligned (s + off + 0x30);
u8x16_store_unaligned (v0, d + off + 0x00);
u8x16_store_unaligned (v1, d + off + 0x10);
u8x16_store_unaligned (v2, d + off + 0x20);
u8x16_store_unaligned (v3, d + off + 0x30);
v0 = u8x16_load_unaligned (s + off + 0x40);
v1 = u8x16_load_unaligned (s + off + 0x50);
v2 = u8x16_load_unaligned (s + off + 0x60);
v3 = u8x16_load_unaligned (s + off + 0x70);
u8x16_store_unaligned (v0, d + off + 0x40);
u8x16_store_unaligned (v1, d + off + 0x50);
u8x16_store_unaligned (v2, d + off + 0x60);
u8x16_store_unaligned (v3, d + off + 0x70);
off += 128;
}
if (PREDICT_TRUE (nr & 64))
{
v0 = u8x16_load_unaligned (s + off + 0x00);
v1 = u8x16_load_unaligned (s + off + 0x10);
v2 = u8x16_load_unaligned (s + off + 0x20);
v3 = u8x16_load_unaligned (s + off + 0x30);
u8x16_store_unaligned (v0, d + off + 0x00);
u8x16_store_unaligned (v1, d + off + 0x10);
u8x16_store_unaligned (v2, d + off + 0x20);
u8x16_store_unaligned (v3, d + off + 0x30);
off += 64;
}
if (PREDICT_TRUE (nr & 32))
{
v0 = u8x16_load_unaligned (s + off + 0x00);
v1 = u8x16_load_unaligned (s + off + 0x10);
u8x16_store_unaligned (v0, d + off + 0x00);
u8x16_store_unaligned (v1, d + off + 0x10);
off += 32;
}
if (PREDICT_TRUE (nr & 16))
{
/* also entered directly for 49..64 bytes, with off == 32 */
one:
u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
}
/* tail: last 16 bytes of the region, may overlap bytes already written */
done2:
u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
}
return dst;
#else
#error "SSE/AVX2/AVX512 must be enabled"
#endif
/* not reached when one of the vector branches above is compiled in: each
   of them ends with its own return */
return dst;
}
/* clang-format off */
WARN_ON (stringop-overflow)
/* clang-format on */
#endif
#endif