avf: 512-bit SIMD version of avf_tx_prepare
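Use AVX-512 operations in avf_tx_prepare() so that, when CLIB_HAVE_VEC512 is defined, the fast path handles eight packets per loop iteration instead of four.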
Type: improvement
Signed-off-by: Leyi Rong <leyi.rong@intel.com>
Change-Id: I01e0b4a2e2d440659b4298668a868d983f5091c3
diff --git a/src/plugins/avf/output.c b/src/plugins/avf/output.c
index 116b055..daa86ae 100644
--- a/src/plugins/avf/output.c
+++ b/src/plugins/avf/output.c
@@ -230,7 +230,11 @@
{
const u64 cmd_eop = AVF_TXD_CMD_EOP;
u16 n_free_desc, n_desc_left, n_packets_left = n_packets;
+#if defined CLIB_HAVE_VEC512
+ vlib_buffer_t *b[8];
+#else
vlib_buffer_t *b[4];
+#endif
avf_tx_desc_t *d = txq->tmp_descs;
u32 *tb = txq->tmp_bufs;
@@ -241,11 +245,30 @@
while (n_packets_left && n_desc_left)
{
+#if defined CLIB_HAVE_VEC512
+ u32 flags;
+ u64x8 or_flags_vec512;
+ u64x8 flags_mask_vec512;
+#else
u32 flags, or_flags;
+#endif
+#if defined CLIB_HAVE_VEC512
+ if (n_packets_left < 8 || n_desc_left < 8)
+#else
if (n_packets_left < 8 || n_desc_left < 4)
+#endif
goto one_by_one;
+#if defined CLIB_HAVE_VEC512
+ u64x8 base_ptr = u64x8_splat (vm->buffer_main->buffer_mem_start);
+ u32x8 buf_indices = u32x8_load_unaligned (buffers);
+
+ *(u64x8 *) &b = base_ptr + u64x8_from_u32x8 (
+ buf_indices << CLIB_LOG2_CACHE_LINE_BYTES);
+
+ or_flags_vec512 = u64x8_i64gather (u64x8_load_unaligned (b), 0, 1);
+#else
vlib_prefetch_buffer_with_index (vm, buffers[4], LOAD);
vlib_prefetch_buffer_with_index (vm, buffers[5], LOAD);
vlib_prefetch_buffer_with_index (vm, buffers[6], LOAD);
@@ -257,12 +280,37 @@
b[3] = vlib_get_buffer (vm, buffers[3]);
or_flags = b[0]->flags | b[1]->flags | b[2]->flags | b[3]->flags;
+#endif
+#if defined CLIB_HAVE_VEC512
+ flags_mask_vec512 = u64x8_splat (
+ VLIB_BUFFER_NEXT_PRESENT | VNET_BUFFER_F_OFFLOAD | VNET_BUFFER_F_GSO);
+ if (PREDICT_FALSE (
+ !u64x8_is_all_zero (or_flags_vec512 & flags_mask_vec512)))
+#else
if (PREDICT_FALSE (or_flags &
(VLIB_BUFFER_NEXT_PRESENT | VNET_BUFFER_F_OFFLOAD |
VNET_BUFFER_F_GSO)))
+#endif
goto one_by_one;
+#if defined CLIB_HAVE_VEC512
+ vlib_buffer_copy_indices (tb, buffers, 8);
+ avf_tx_fill_data_desc (vm, d + 0, b[0], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 1, b[1], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 2, b[2], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 3, b[3], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 4, b[4], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 5, b[5], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 6, b[6], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 7, b[7], cmd_eop, use_va_dma);
+
+ buffers += 8;
+ n_packets_left -= 8;
+ n_desc_left -= 8;
+ d += 8;
+ tb += 8;
+#else
vlib_buffer_copy_indices (tb, buffers, 4);
avf_tx_fill_data_desc (vm, d + 0, b[0], cmd_eop, use_va_dma);
@@ -275,6 +323,8 @@
n_desc_left -= 4;
d += 4;
tb += 4;
+#endif
+
continue;
one_by_one:
diff --git a/src/vppinfra/vector_avx512.h b/src/vppinfra/vector_avx512.h
index 96c78e4..eda65ca 100644
--- a/src/vppinfra/vector_avx512.h
+++ b/src/vppinfra/vector_avx512.h
@@ -99,6 +99,11 @@
return (u32) _mm512_movepi16_mask ((__m512i) v);
}
+/* Wrapper around _mm512_i64gather_epi64 returning a u64x8.  Arguments and
+ * the whole expansion are parenthesized so it is safe in any expression. */
+#define u64x8_i64gather(index, base, scale)                                   \
+  ((u64x8) _mm512_i64gather_epi64 ((__m512i) (index), (base), (scale)))
+
/* 512-bit packs */
#define _(f, t, fn) \
always_inline t t##_pack (f lo, f hi) \