ip: enhance vtep4_check of tunnel by vector way
This patch improves decap performance of tunnels such as vxlan,
vxlan_gpe, geneve and gtpu by avoiding expensive hash_get calls
whenever possible, using AVX512 on Xeon.
In the existing code, if the vtep4 key of the current packet matches
the last vtep4_key_t, the expensive hash computation is avoided and the
code returns directly.
This patch greatly improves tunnel decap in the multiple-flow case by
using a 512-bit vector register on Xeon, which holds 8 vtep4_keys.
It increases the chance of avoiding an unnecessary hash computation:
the lookup is skipped whenever the hash key of the current packet hits
any one of the 8 keys in the 512-bit cache.
The oldest element in vtep4_cache_t is replaced in round-robin order.
The decap nodes are also converted to use vlib_get_buffers.
Type: improvement
Signed-off-by: Zhiyong Yang <zhiyong.yang@intel.com>
Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Signed-off-by: Junfeng Wang <drenfong.wang@intel.com>
Change-Id: I313103202bd76f2dd638cd942554721b37ddad60
diff --git a/src/plugins/gtpu/gtpu_decap.c b/src/plugins/gtpu/gtpu_decap.c
index 05c2138..7a88aae 100644
--- a/src/plugins/gtpu/gtpu_decap.c
+++ b/src/plugins/gtpu/gtpu_decap.c
@@ -804,10 +804,16 @@
matching a local VTEP address */
vtep6_key_t last_vtep6; /* last IPv6 address / fib index
matching a local VTEP address */
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+ vtep4_cache_t vtep4_u512;
+ clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
next_index = node->cached_next_index;
+ vlib_get_buffers (vm, from, bufs, n_left_from);
if (node->flags & VLIB_NODE_FLAG_TRACE)
ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -835,16 +841,11 @@
/* Prefetch next iteration. */
{
- vlib_buffer_t * p2, * p3;
+ vlib_prefetch_buffer_header (b[2], LOAD);
+ vlib_prefetch_buffer_header (b[3], LOAD);
- p2 = vlib_get_buffer (vm, from[2]);
- p3 = vlib_get_buffer (vm, from[3]);
-
- vlib_prefetch_buffer_header (p2, LOAD);
- vlib_prefetch_buffer_header (p3, LOAD);
-
- CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
- CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[2]->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[3]->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
}
bi0 = to_next[0] = from[0];
@@ -854,8 +855,9 @@
to_next += 2;
n_left_to_next -= 2;
- b0 = vlib_get_buffer (vm, bi0);
- b1 = vlib_get_buffer (vm, bi1);
+ b0 = b[0];
+ b1 = b[1];
+ b += 2;
if (is_ip4)
{
ip40 = vlib_buffer_get_current (b0);
@@ -899,7 +901,12 @@
/* Validate DIP against VTEPs*/
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&gtm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&gtm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit0; /* no local VTEP for GTPU packet */
}
else
@@ -973,7 +980,12 @@
/* Validate DIP against VTEPs*/
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&gtm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&gtm->vtep_table, b1, ip41, &last_vtep4))
+#endif
goto exit1; /* no local VTEP for GTPU packet */
}
else
@@ -1053,7 +1065,8 @@
to_next += 1;
n_left_to_next -= 1;
- b0 = vlib_get_buffer (vm, bi0);
+ b0 = b[0];
+ b++;
if (is_ip4)
ip40 = vlib_buffer_get_current (b0);
else
@@ -1083,7 +1096,12 @@
/* Validate DIP against VTEPs*/
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&gtm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&gtm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit; /* no local VTEP for GTPU packet */
}
else
diff --git a/src/vnet/geneve/decap.c b/src/vnet/geneve/decap.c
index 10a17ce..b570e35 100644
--- a/src/vnet/geneve/decap.c
+++ b/src/vnet/geneve/decap.c
@@ -869,11 +869,18 @@
matching a local VTEP address */
vtep6_key_t last_vtep6; /* last IPv6 address / fib index
matching a local VTEP address */
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+ vtep4_cache_t vtep4_u512;
+ clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
next_index = node->cached_next_index;
+ vlib_get_buffers (vm, from, bufs, n_left_from);
+
if (node->flags & VLIB_NODE_FLAG_TRACE)
ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -900,16 +907,11 @@
/* Prefetch next iteration. */
{
- vlib_buffer_t *p2, *p3;
+ vlib_prefetch_buffer_header (b[2], LOAD);
+ vlib_prefetch_buffer_header (b[3], LOAD);
- p2 = vlib_get_buffer (vm, from[2]);
- p3 = vlib_get_buffer (vm, from[3]);
-
- vlib_prefetch_buffer_header (p2, LOAD);
- vlib_prefetch_buffer_header (p3, LOAD);
-
- CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
- CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
}
bi0 = to_next[0] = from[0];
@@ -919,8 +921,9 @@
to_next += 2;
n_left_to_next -= 2;
- b0 = vlib_get_buffer (vm, bi0);
- b1 = vlib_get_buffer (vm, bi1);
+ b0 = b[0];
+ b1 = b[1];
+ b += 2;
if (is_ip4)
{
ip40 = vlib_buffer_get_current (b0);
@@ -964,7 +967,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit0; /* no local VTEP for GENEVE packet */
}
else
@@ -1042,7 +1050,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&vxm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&vxm->vtep_table, b1, ip41, &last_vtep4))
+#endif
goto exit1; /* no local VTEP for GENEVE packet */
}
else
@@ -1126,7 +1139,8 @@
to_next += 1;
n_left_to_next -= 1;
- b0 = vlib_get_buffer (vm, bi0);
+ b0 = b[0];
+ b++;
if (is_ip4)
ip40 = vlib_buffer_get_current (b0);
else
@@ -1156,7 +1170,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit; /* no local VTEP for GENEVE packet */
}
else
diff --git a/src/vnet/ip/vtep.h b/src/vnet/ip/vtep.h
index 703ace1..345b6db 100644
--- a/src/vnet/ip/vtep.h
+++ b/src/vnet/ip/vtep.h
@@ -112,6 +112,83 @@
return VTEP_CHECK_PASS;
}
+/**
+ * Cache of the most recently validated IPv4 VTEP keys.
+ *
+ * vtep4_cache is read as a single u64x8 vector by vtep4_check_vector (), so
+ * a key can be compared against all eight slots at once.  idx is the slot
+ * that the next successful hash lookup overwrites (round-robin).
+ */
+typedef struct
+{
+  vtep4_key_t vtep4_cache[8];
+  int idx;
+} vtep4_cache_t;
+
+/**
+ * Check whether the IPv4 destination of a packet is a local VTEP address.
+ *
+ * The key is built from the destination address in ip40 and the IPv4 FIB
+ * index of b0.  It is compared against last_k4 first, then (when
+ * CLIB_HAVE_VEC512 is defined) against the eight keys in vtep4_u512, and
+ * only if both miss is it looked up in the t->vtep4 hash.
+ *
+ * @param t           VTEP table holding the vtep4 hash.
+ * @param b0          buffer the FIB index is taken from.
+ * @param ip40        IPv4 header supplying the destination address.
+ * @param last_k4     in/out: last matching key; set when another key matches.
+ * @param vtep4_u512  in/out: 8-entry key cache; updated after a hash hit.
+ *
+ * @return VTEP_CHECK_PASS_UNCHANGED when last_k4 or the cache matched,
+ *         VTEP_CHECK_PASS when the hash lookup succeeded,
+ *         VTEP_CHECK_FAIL when the key is not in the hash.
+ *
+ * NOTE(review): the callers in this patch zero the cache with clib_memset,
+ * so an all-zero key (destination 0.0.0.0, FIB index 0) matches an unused
+ * slot and passes without a hash lookup - confirm this is acceptable.
+ */
+always_inline u8
+vtep4_check_vector (vtep_table_t * t, vlib_buffer_t * b0, ip4_header_t * ip40,
+                    vtep4_key_t * last_k4, vtep4_cache_t * vtep4_u512)
+{
+  vtep4_key_t k4;
+  k4.addr.as_u32 = ip40->dst_address.as_u32;
+  k4.fib_index = vlib_buffer_get_ip4_fib_index (b0);
+
+  /* Fast path: same key as the last one that matched. */
+  if (PREDICT_TRUE (k4.as_u64 == last_k4->as_u64))
+    return VTEP_CHECK_PASS_UNCHANGED;
+
+#ifdef CLIB_HAVE_VEC512
+  /* The cache sits in a caller-declared struct with no 64-byte alignment
+   * guarantee, so it has to be read with an unaligned load. */
+  u64x8 k4_u64x8 = u64x8_splat (k4.as_u64);
+  u64x8 cache = u64x8_load_unaligned (vtep4_u512->vtep4_cache);
+  u8 result = u64x8_mask_is_equal (cache, k4_u64x8);
+  if (PREDICT_TRUE (result != 0))
+    {
+      /* Cache hit: record the key in last_k4 so the next packet with the
+       * same key takes the scalar fast path above. */
+      last_k4->as_u64 = k4.as_u64;
+      return VTEP_CHECK_PASS_UNCHANGED;
+    }
+#endif
+
+  /* Slow path: look the key up in the VTEP hash. */
+  if (PREDICT_FALSE (!hash_get (t->vtep4, k4.as_u64)))
+    return VTEP_CHECK_FAIL;
+
+  last_k4->as_u64 = k4.as_u64;
+
+#ifdef CLIB_HAVE_VEC512
+  /* Overwrite the oldest slot; idx wraps over the 8 entries. */
+  vtep4_u512->vtep4_cache[vtep4_u512->idx].as_u64 = k4.as_u64;
+  vtep4_u512->idx = (vtep4_u512->idx + 1) & 0x7;
+#endif
+
+  return VTEP_CHECK_PASS;
+}
+
always_inline u8
vtep6_check (vtep_table_t * t, vlib_buffer_t * b0, ip6_header_t * ip60,
vtep6_key_t * last_k6)
diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c
index f2961d5..77b5328 100644
--- a/src/vnet/vxlan-gpe/decap.c
+++ b/src/vnet/vxlan-gpe/decap.c
@@ -792,11 +792,18 @@
matching a local VTEP address */
vtep6_key_t last_vtep6; /* last IPv6 address / fib index
matching a local VTEP address */
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+#ifdef CLIB_HAVE_VEC512
+ vtep4_cache_t vtep4_u512;
+ clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
next_index = node->cached_next_index;
+ vlib_get_buffers (vm, from, bufs, n_left_from);
+
if (node->flags & VLIB_NODE_FLAG_TRACE)
ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -823,16 +830,11 @@
/* Prefetch next iteration. */
{
- vlib_buffer_t *p2, *p3;
+ vlib_prefetch_buffer_header (b[2], LOAD);
+ vlib_prefetch_buffer_header (b[3], LOAD);
- p2 = vlib_get_buffer (vm, from[2]);
- p3 = vlib_get_buffer (vm, from[3]);
-
- vlib_prefetch_buffer_header (p2, LOAD);
- vlib_prefetch_buffer_header (p3, LOAD);
-
- CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
- CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
}
bi0 = to_next[0] = from[0];
@@ -842,8 +844,9 @@
to_next += 2;
n_left_to_next -= 2;
- b0 = vlib_get_buffer (vm, bi0);
- b1 = vlib_get_buffer (vm, bi1);
+ b0 = b[0];
+ b1 = b[1];
+ b += 2;
if (is_ip4)
{
ip40 = vlib_buffer_get_current (b0);
@@ -885,7 +888,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&ngm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&ngm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit0; /* no local VTEP for VXLAN packet */
}
else
@@ -963,7 +971,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&ngm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&ngm->vtep_table, b1, ip41, &last_vtep4))
+#endif
goto exit1; /* no local VTEP for VXLAN packet */
}
else
@@ -1047,7 +1060,8 @@
to_next += 1;
n_left_to_next -= 1;
- b0 = vlib_get_buffer (vm, bi0);
+ b0 = b[0];
+ b++;
if (is_ip4)
ip40 = vlib_buffer_get_current (b0);
else
@@ -1073,9 +1087,15 @@
goto exit; /* not VXLAN packet */
/* Validate DIP against VTEPs */
+
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&ngm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&ngm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit; /* no local VTEP for VXLAN packet */
}
else
diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c
index d2ba62d..e41c6a9 100644
--- a/src/vnet/vxlan/decap.c
+++ b/src/vnet/vxlan/decap.c
@@ -458,11 +458,19 @@
matching a local VTEP address */
vtep6_key_t last_vtep6; /* last IPv6 address / fib index
matching a local VTEP address */
+ vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+
+#ifdef CLIB_HAVE_VEC512
+ vtep4_cache_t vtep4_u512;
+ clib_memset (&vtep4_u512, 0, sizeof (vtep4_u512));
+#endif
from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
next_index = node->cached_next_index;
+ vlib_get_buffers (vm, from, bufs, n_left_from);
+
if (node->flags & VLIB_NODE_FLAG_TRACE)
ip4_forward_next_trace (vm, node, frame, VLIB_TX);
@@ -489,16 +497,11 @@
/* Prefetch next iteration. */
{
- vlib_buffer_t *p2, *p3;
+ vlib_prefetch_buffer_header (b[2], LOAD);
+ vlib_prefetch_buffer_header (b[3], LOAD);
- p2 = vlib_get_buffer (vm, from[2]);
- p3 = vlib_get_buffer (vm, from[3]);
-
- vlib_prefetch_buffer_header (p2, LOAD);
- vlib_prefetch_buffer_header (p3, LOAD);
-
- CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
- CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
}
bi0 = to_next[0] = from[0];
@@ -508,8 +511,9 @@
to_next += 2;
n_left_to_next -= 2;
- b0 = vlib_get_buffer (vm, bi0);
- b1 = vlib_get_buffer (vm, bi1);
+ b0 = b[0];
+ b1 = b[1];
+ b += 2;
if (is_ip4)
{
ip40 = vlib_buffer_get_current (b0);
@@ -553,7 +557,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit0; /* no local VTEP for VXLAN packet */
}
else
@@ -627,7 +636,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&vxm->vtep_table, b1, ip41, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&vxm->vtep_table, b1, ip41, &last_vtep4))
+#endif
goto exit1; /* no local VTEP for VXLAN packet */
}
else
@@ -707,7 +721,8 @@
to_next += 1;
n_left_to_next -= 1;
- b0 = vlib_get_buffer (vm, bi0);
+ b0 = b[0];
+ b++;
if (is_ip4)
ip40 = vlib_buffer_get_current (b0);
else
@@ -737,7 +752,12 @@
/* Validate DIP against VTEPs */
if (is_ip4)
{
+#ifdef CLIB_HAVE_VEC512
+ if (!vtep4_check_vector
+ (&vxm->vtep_table, b0, ip40, &last_vtep4, &vtep4_u512))
+#else
if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4))
+#endif
goto exit; /* no local VTEP for VXLAN packet */
}
else
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index 6eb7c5e..a51644b 100644
--- a/src/vppinfra/vector_avx512.h
+++ b/src/vppinfra/vector_avx512.h
@@ -246,6 +246,15 @@
return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
+/* Compare the eight 64-bit lanes of a and b for equality and return the
+ * result as a bit mask, one bit per lane (the __mmask8 from the intrinsic). */
+static_always_inline u8
+u64x8_mask_is_equal (u64x8 a, u64x8 b)
+{
+  __mmask8 lanes_equal = _mm512_cmpeq_epu64_mask ((__m512i) a, (__m512i) b);
+  return lanes_equal;
+}
+
static_always_inline void
u32x16_transpose (u32x16 m[16])
{