/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_ip_csum_h
#define included_vector_ip_csum_h
#include <vppinfra/clib.h>
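/* Running checksum state: `sum` accumulates 16-bit words in a 64-bit
 * counter (folded only at the end), `odd` records that the previous chunk
 * ended on an odd byte, so the next chunk starts in the middle of a word. */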
typedef struct
{
  u64 sum;
  u8 odd;
} clib_ip_csum_t;

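/* Helpers for the vector paths below: the `cvt_and_add` functions
 * zero-extend 32-bit lanes to 64 bits and add the low and high halves of
 * the vector, so lane sums cannot overflow while the main loops run; the
 * `hadd` functions reduce the 64-bit lanes to a single scalar. */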
#if defined(CLIB_HAVE_VEC128)
static_always_inline u64x2
clib_ip_csum_cvt_and_add_4 (u32x4 v)
{
  return ((u64x2) u32x4_interleave_lo ((u32x4) v, u32x4_zero ()) +
          (u64x2) u32x4_interleave_hi ((u32x4) v, u32x4_zero ()));
}
static_always_inline u64
clib_ip_csum_hadd_2 (u64x2 v)
{
  return v[0] + v[1];
}
#endif

#if defined(CLIB_HAVE_VEC256)
static_always_inline u64x4
clib_ip_csum_cvt_and_add_8 (u32x8 v)
{
  return ((u64x4) u32x8_interleave_lo ((u32x8) v, u32x8_zero ()) +
          (u64x4) u32x8_interleave_hi ((u32x8) v, u32x8_zero ()));
}
static_always_inline u64
clib_ip_csum_hadd_4 (u64x4 v)
{
  return clib_ip_csum_hadd_2 (u64x4_extract_lo (v) + u64x4_extract_hi (v));
}
#endif

#if defined(CLIB_HAVE_VEC512)
static_always_inline u64x8
clib_ip_csum_cvt_and_add_16 (u32x16 v)
{
  return ((u64x8) u32x16_interleave_lo ((u32x16) v, u32x16_zero ()) +
          (u64x8) u32x16_interleave_hi ((u32x16) v, u32x16_zero ()));
}
static_always_inline u64
clib_ip_csum_hadd_8 (u64x8 v)
{
  return clib_ip_csum_hadd_4 (u64x8_extract_lo (v) + u64x8_extract_hi (v));
}
#endif

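/* Core worker: adds `count` bytes starting at `src` to the running sum and,
 * when `is_copy` is set, copies them to `dst` on the way. Vector loops
 * handle the bulk of the data, scalar loops finish the remainder. All
 * callers pass a constant `is_copy`, so the copy branches are compiled away
 * when not needed. */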
static_always_inline void
clib_ip_csum_inline (clib_ip_csum_t *c, u8 *dst, u8 *src, u16 count,
                     int is_copy)
{
  if (c->odd)
    {
      c->odd = 0;
      c->sum += (u16) src[0] << 8;
      if (is_copy)
        dst++[0] = src[0];
      count--;
      src++;
    }

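  /* Widest available vector path: the unrolled main loop consumes 8 full
   * vectors per iteration (512, 256 or 128 bytes depending on the target),
   * then a single-vector loop drains what is left, one register at a
   * time. */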
#if defined(CLIB_HAVE_VEC512)
  u64x8 sum8 = {};

  while (count >= 512)
    {
      u32x16u *s = (u32x16u *) src;
      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[1]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[2]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[3]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[4]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[5]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[6]);
      sum8 += clib_ip_csum_cvt_and_add_16 (s[7]);
      count -= 512;
      src += 512;
      if (is_copy)
        {
          u32x16u *d = (u32x16u *) dst;
          d[0] = s[0];
          d[1] = s[1];
          d[2] = s[2];
          d[3] = s[3];
          d[4] = s[4];
          d[5] = s[5];
          d[6] = s[6];
          d[7] = s[7];
          dst += 512;
        }
    }

  while (count >= 64)
    {
      u32x16u *s = (u32x16u *) src;
      sum8 += clib_ip_csum_cvt_and_add_16 (s[0]);
      count -= 64;
      src += 64;
      if (is_copy)
        {
          u32x16u *d = (u32x16u *) dst;
          d[0] = s[0];
          dst += 64;
        }
    }

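  /* When masked loads/stores are available (e.g. on AVX-512 targets), the
   * sub-vector remainder, including any odd trailing byte, is consumed with
   * a single masked load and the function returns early; otherwise the
   * scalar tail below finishes the job. */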
#if defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
  if (count)
    {
      u64 mask = pow2_mask (count);
      u32x16 v = (u32x16) u8x64_mask_load_zero (src, mask);
      sum8 += clib_ip_csum_cvt_and_add_16 (v);
      c->odd = count & 1;
      if (is_copy)
        u8x64_mask_store ((u8x64) v, dst, mask);
    }
  c->sum += clib_ip_csum_hadd_8 (sum8);
  return;
#endif

  c->sum += clib_ip_csum_hadd_8 (sum8);
#elif defined(CLIB_HAVE_VEC256)
  u64x4 sum4 = {};

  while (count >= 256)
    {
      u32x8u *s = (u32x8u *) src;
      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[1]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[2]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[3]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[4]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[5]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[6]);
      sum4 += clib_ip_csum_cvt_and_add_8 (s[7]);
      count -= 256;
      src += 256;
      if (is_copy)
        {
          u32x8u *d = (u32x8u *) dst;
          d[0] = s[0];
          d[1] = s[1];
          d[2] = s[2];
          d[3] = s[3];
          d[4] = s[4];
          d[5] = s[5];
          d[6] = s[6];
          d[7] = s[7];
          dst += 256;
        }
    }

  while (count >= 32)
    {
      u32x8u *s = (u32x8u *) src;
      sum4 += clib_ip_csum_cvt_and_add_8 (s[0]);
      count -= 32;
      src += 32;
      if (is_copy)
        {
          u32x8u *d = (u32x8u *) dst;
          d[0] = s[0];
          dst += 32;
        }
    }

#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  if (count)
    {
      u32 mask = pow2_mask (count);
      u32x8 v = (u32x8) u8x32_mask_load_zero (src, mask);
      sum4 += clib_ip_csum_cvt_and_add_8 (v);
      c->odd = count & 1;
      if (is_copy)
        u8x32_mask_store ((u8x32) v, dst, mask);
    }
  c->sum += clib_ip_csum_hadd_4 (sum4);
  return;
#endif

  c->sum += clib_ip_csum_hadd_4 (sum4);
#elif defined(CLIB_HAVE_VEC128)
  u64x2 sum2 = {};

  while (count >= 128)
    {
      u32x4u *s = (u32x4u *) src;
      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[1]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[2]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[3]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[4]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[5]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[6]);
      sum2 += clib_ip_csum_cvt_and_add_4 (s[7]);
      count -= 128;
      src += 128;
      if (is_copy)
        {
          u32x4u *d = (u32x4u *) dst;
          d[0] = s[0];
          d[1] = s[1];
          d[2] = s[2];
          d[3] = s[3];
          d[4] = s[4];
          d[5] = s[5];
          d[6] = s[6];
          d[7] = s[7];
          dst += 128;
        }
    }

  while (count >= 16)
    {
      u32x4u *s = (u32x4u *) src;
      sum2 += clib_ip_csum_cvt_and_add_4 (s[0]);
      count -= 16;
      src += 16;
      if (is_copy)
        {
          u32x4u *d = (u32x4u *) dst;
          d[0] = s[0];
          dst += 16;
        }
    }
  c->sum += clib_ip_csum_hadd_2 (sum2);
#else
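  /* No vector unit: accumulate 32-bit words; the 64-bit accumulator absorbs
   * the carries until the final fold. */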
  while (count >= 4)
    {
      u32 v = *((u32 *) src);
      c->sum += v;
      count -= 4;
      src += 4;
      if (is_copy)
        {
          *(u32 *) dst = v;
          dst += 4;
        }
    }
#endif
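  /* Common tail: fold in any remaining 16-bit words, then record a trailing
   * odd byte so a follow-up chunk can continue in the middle of a word. */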
  while (count >= 2)
    {
      u16 v = *((u16 *) src);
      c->sum += v;
      count -= 2;
      src += 2;
      if (is_copy)
        {
          *(u16 *) dst = v;
          dst += 2;
        }
    }

  if (count)
    {
      c->odd = 1;
      c->sum += (u16) src[0];
      if (is_copy)
        dst[0] = src[0];
    }
}

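/* Fold the 64-bit accumulator down to the final 16-bit one's-complement
 * checksum: add the upper half into the lower half with end-around carry,
 * repeat for 16 bits, then return the complement. */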
static_always_inline u16
clib_ip_csum_fold (clib_ip_csum_t *c)
{
  u64 sum = c->sum;
#if defined(__x86_64__) && defined(__BMI2__)
  u64 tmp = sum;
  asm volatile(
    /* using ADC is much faster than the mov, shift, add sequence
     * the compiler produces */
    "shr $32, %[sum] \n\t"
    "add %k[tmp], %k[sum] \n\t"
    "mov $16, %k[tmp] \n\t"
    "shrx %k[tmp], %k[sum], %k[tmp] \n\t"
    "adc %w[tmp], %w[sum] \n\t"
    "adc $0, %w[sum] \n\t"
    : [ sum ] "+&r"(sum), [ tmp ] "+&r"(tmp));
#else
  sum = ((u32) sum) + (sum >> 32);
  sum = ((u16) sum) + (sum >> 16);
  sum = ((u16) sum) + (sum >> 16);
#endif
  return (~((u16) sum));
}

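/* Incremental interface: feed an arbitrary sequence of chunks (odd lengths
 * are fine, the `odd` flag carries the alignment across calls), then call
 * clib_ip_csum_fold() once to get the checksum. */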
static_always_inline void
clib_ip_csum_chunk (clib_ip_csum_t *c, u8 *src, u16 count)
{
  clib_ip_csum_inline (c, 0, src, count, 0);
}

static_always_inline void
clib_ip_csum_and_copy_chunk (clib_ip_csum_t *c, u8 *src, u8 *dst, u16 count)
{
  clib_ip_csum_inline (c, dst, src, count, 1);
}

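/* One-shot checksum over `count` bytes. When `count` is a compile-time
 * constant matching a common fixed size (12, 20 or 40 bytes, e.g. an IPv4
 * pseudo header, an IPv4 header without options, or an IPv6 header), the
 * sum is computed with a short unrolled loop of 32-bit adds instead of the
 * general worker. */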
static_always_inline u16
clib_ip_csum (u8 *src, u16 count)
{
  clib_ip_csum_t c = {};
  if (COMPILE_TIME_CONST (count) && count == 12)
    {
      for (int i = 0; i < 3; i++)
        c.sum += ((u32 *) src)[i];
    }
  else if (COMPILE_TIME_CONST (count) && count == 20)
    {
      for (int i = 0; i < 5; i++)
        c.sum += ((u32 *) src)[i];
    }
  else if (COMPILE_TIME_CONST (count) && count == 40)
    {
      for (int i = 0; i < 10; i++)
        c.sum += ((u32 *) src)[i];
    }
  else
    clib_ip_csum_inline (&c, 0, src, count, 0);
  return clib_ip_csum_fold (&c);
}

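/* One-shot checksum of `count` bytes from `src`, copied to `dst` as they
 * are summed. */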
static_always_inline u16
clib_ip_csum_and_copy (u8 *dst, u8 *src, u16 count)
{
  clib_ip_csum_t c = {};
  clib_ip_csum_inline (&c, dst, src, count, 1);
  return clib_ip_csum_fold (&c);
}
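/*
 * Usage sketch (illustrative only; `ip4`, `pseudo_hdr`, `payload` and
 * `payload_len` are hypothetical, and checksum fields are assumed to be
 * zeroed before summing):
 *
 *   u16 hdr_csum = clib_ip_csum ((u8 *) ip4, 20);
 *
 *   clib_ip_csum_t c = {};
 *   clib_ip_csum_chunk (&c, (u8 *) pseudo_hdr, 12);
 *   clib_ip_csum_chunk (&c, payload, payload_len);
 *   u16 l4_csum = clib_ip_csum_fold (&c);
 */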
#endif