vppinfra: vectorized index to pointer function
Type: improvement
Change-Id: I05e1a8fa31761b113355123429d72da18881d4b0
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 77964fd..30fe234 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -42,6 +42,7 @@
#include <vppinfra/hash.h>
#include <vppinfra/fifo.h>
+#include <vppinfra/vector/index_to_ptr.h>
#include <vlib/buffer.h>
#include <vlib/physmem_funcs.h>
#include <vlib/main.h>
@@ -201,102 +202,38 @@
@param offset - (i32) offset applied to each pointer
*/
static_always_inline void
-vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count,
+vlib_get_buffers_with_offset (vlib_main_t *vm, u32 *bi, void **b, u32 count,
i32 offset)
{
uword buffer_mem_start = vm->buffer_main->buffer_mem_start;
-#ifdef CLIB_HAVE_VEC512
- u64x8 of8 = u64x8_splat (buffer_mem_start + offset);
- u64x4 off = u64x8_extract_lo (of8);
- /* if count is not const, compiler will not unroll while loop
- se we maintain two-in-parallel variant */
- while (count >= 32)
+ void *base = (void *) (buffer_mem_start + offset); /* result for index 0 */
+ int objsize = __builtin_object_size (b, 0); /* bytes at b, -1 if unknown */
+ const int sh = CLIB_LOG2_CACHE_LINE_BYTES;
+ /* block path: count not constant, b[] known to hold >= 64 and 8*k pointers */
+ if (COMPILE_TIME_CONST (count) == 0 && objsize >= 64 * sizeof (b[0]) &&
+ (objsize & ((8 * sizeof (b[0])) - 1)) == 0)
{
- u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
- u64x8 b1 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 8));
- u64x8 b2 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 16));
- u64x8 b3 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 24));
- /* shift and add to get vlib_buffer_t pointer */
- u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
- u64x8_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 8);
- u64x8_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 16);
- u64x8_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 24);
- b += 32;
- bi += 32;
- count -= 32;
+ u32 n = round_pow2 (count, 8); /* whole blocks of 8, may pass count by 7 */
+ ASSERT (objsize >= count * sizeof (b[0])); /* objsize is in bytes */
+ CLIB_ASSUME (objsize >= count * sizeof (b[0]));
+ while (n >= 64)
+ {
+ clib_index_to_ptr_u32 (bi, base, sh, b, 64);
+ b += 64;
+ bi += 64;
+ n -= 64;
+ }
+
+ while (n)
+ {
+ clib_index_to_ptr_u32 (bi, base, sh, b, 8);
+ b += 8;
+ bi += 8;
+ n -= 8;
+ }
}
- while (count >= 8)
- {
- u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi));
- /* shift and add to get vlib_buffer_t pointer */
- u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b);
- b += 8;
- bi += 8;
- count -= 8;
- }
-#elif defined CLIB_HAVE_VEC256
- u64x4 off = u64x4_splat (buffer_mem_start + offset);
- /* if count is not const, compiler will not unroll while loop
- se we maintain two-in-parallel variant */
- while (count >= 32)
- {
- u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
- u64x4 b1 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 4));
- u64x4 b2 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 8));
- u64x4 b3 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 12));
- u64x4 b4 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 16));
- u64x4 b5 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 20));
- u64x4 b6 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 24));
- u64x4 b7 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 28));
- /* shift and add to get vlib_buffer_t pointer */
- u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
- u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4);
- u64x4_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 8);
- u64x4_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 12);
- u64x4_store_unaligned ((b4 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 16);
- u64x4_store_unaligned ((b5 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 20);
- u64x4_store_unaligned ((b6 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 24);
- u64x4_store_unaligned ((b7 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 28);
- b += 32;
- bi += 32;
- count -= 32;
- }
-#endif
- while (count >= 4)
- {
-#ifdef CLIB_HAVE_VEC256
- u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi));
- /* shift and add to get vlib_buffer_t pointer */
- u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
-#elif defined (CLIB_HAVE_VEC128)
- u64x2 off = u64x2_splat (buffer_mem_start + offset);
- u32x4 bi4 = u32x4_load_unaligned (bi);
- u64x2 b0 = u64x2_from_u32x4 ((u32x4) bi4);
-#if defined (__aarch64__)
- u64x2 b1 = u64x2_from_u32x4_high ((u32x4) bi4);
-#else
- bi4 = u32x4_shuffle (bi4, 2, 3, 0, 1);
- u64x2 b1 = u64x2_from_u32x4 ((u32x4) bi4);
-#endif
- u64x2_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b);
- u64x2_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 2);
-#else
- b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset);
- b[1] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[1], offset);
- b[2] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[2], offset);
- b[3] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[3], offset);
-#endif
- b += 4;
- bi += 4;
- count -= 4;
- }
- while (count)
- {
- b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset);
- b += 1;
- bi += 1;
- count -= 1;
- }
+ else /* size of b[] unknown or unsuitable: convert exactly count entries */
+ clib_index_to_ptr_u32 (bi, base, sh, b, count);
}
/** \brief Translate array of buffer indices into buffer pointers
@@ -308,7 +245,7 @@
*/
static_always_inline void
-vlib_get_buffers (vlib_main_t * vm, u32 * bi, vlib_buffer_t ** b, int count)
+vlib_get_buffers (vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, u32 count) /* offset-0 form of vlib_get_buffers_with_offset () */
{
vlib_get_buffers_with_offset (vm, bi, (void **) b, count, 0);
}
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 6900995..11d4a5d 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -195,6 +195,7 @@
vector/array_mask.h
vector/compress.h
vector/count_equal.h
+ vector/index_to_ptr.h
vector/mask_compare.h
vector.h
vector_neon.h
@@ -275,6 +276,7 @@
vector/test/array_mask.c
vector/test/compress.c
vector/test/count_equal.c
+ vector/test/index_to_ptr.c
vector/test/mask_compare.c
)
diff --git a/src/vppinfra/vector/index_to_ptr.h b/src/vppinfra/vector/index_to_ptr.h
new file mode 100644
index 0000000..91de354
--- /dev/null
+++ b/src/vppinfra/vector/index_to_ptr.h
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifndef included_vector_index_to_ptr_h
+#define included_vector_index_to_ptr_h
+#include <vppinfra/clib.h>
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
+{
+  /* four indices at indices + i become four pointers at ptrs + i, as 2 + 2 */
+  u32x4 idx = u32x4_load_unaligned (indices + i);
+  u64x2 lo = u64x2_from_u32x4 (idx);
+#ifdef __aarch64__
+  u64x2 hi = u64x2_from_u32x4_high (idx);
+#else
+  u64x2 hi = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (idx, 8));
+#endif
+  u64x2_store_unaligned ((lo << shift) + ov, ptrs + i);
+  u64x2_store_unaligned ((hi << shift) + ov, ptrs + i + 2);
+}
+#endif
+
+/** \brief Convert array of indices to pointers with base and shift
+
+ @param indices source array of u32 indices
+ @param base base pointer
+ @param shift number of bits each index is shifted left by
+ @param ptrs destination array of pointers
+ @param n_elts number of elements in the source array
+*/
+
+static_always_inline void
+clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
+                       u32 n_elts)
+{
+#if defined CLIB_HAVE_VEC512
+  if (n_elts >= 8)
+    {
+      u64x8 off = u64x8_splat ((u64) base);
+      u64x8 b0, b1, b2, b3, b4, b5, b6, b7;
+
+      while (n_elts >= 64)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
+          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
+          b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
+          b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
+          b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
+          b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
+          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
+          u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
+          u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
+          u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
+          u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
+          ptrs += 64;
+          indices += 64;
+          n_elts -= 64;
+        }
+
+      if (n_elts == 0)
+        return;
+
+      if (n_elts >= 32)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
+          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
+          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
+          ptrs += 32;
+          indices += 32;
+          n_elts -= 32;
+        }
+      if (n_elts >= 16)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
+          ptrs += 16;
+          indices += 16;
+          n_elts -= 16;
+        }
+      if (n_elts > 8)
+        {
+          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
+          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
+          ptrs += 8;
+          indices += 8;
+          n_elts -= 8;
+        }
+      /* at most 8 left: one store over the final 8 slots, overlap is fine */
+      b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
+      u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
+    }
+  else /* fewer than 8 elements: single masked load and store */
+    {
+      u32 mask = pow2_mask (n_elts);
+      u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
+      u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
+    }
+  return; /* both branches are complete, never redo the tail in scalar code */
+#elif defined CLIB_HAVE_VEC256
+  if (n_elts >= 4)
+    {
+      u64x4 off = u64x4_splat ((u64) base);
+      u64x4 b0, b1, b2, b3, b4, b5, b6, b7;
+
+      while (n_elts >= 32)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
+          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
+          b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
+          b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
+          b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
+          b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
+          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
+          u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
+          u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
+          u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
+          u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
+          ptrs += 32;
+          indices += 32;
+          n_elts -= 32;
+        }
+
+      if (n_elts == 0)
+        return;
+
+      if (n_elts >= 16)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
+          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
+          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
+          ptrs += 16;
+          indices += 16;
+          n_elts -= 16;
+        }
+      if (n_elts >= 8)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
+          ptrs += 8;
+          indices += 8;
+          n_elts -= 8;
+        }
+      if (n_elts > 4)
+        {
+          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
+          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
+          ptrs += 4;
+          indices += 4;
+          n_elts -= 4;
+        }
+      /* at most 4 left: one store over the final 4 slots, overlap is fine */
+      b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
+      u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
+      return;
+    }
+#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
+  else /* fewer than 4 elements: single masked load and store */
+    {
+      u32 mask = pow2_mask (n_elts);
+      u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
+      u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
+      return;
+    }
+#endif
+#elif defined(CLIB_HAVE_VEC128)
+  if (n_elts >= 4)
+    {
+      u64x2 ov = u64x2_splat ((u64) base);
+      u32 *i = (u32 *) indices; /* start of the arrays, kept for the tail */
+      void **p = (void **) ptrs;
+      u32 n = n_elts;
+
+      while (n >= 32)
+        {
+          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
+          indices += 32;
+          ptrs += 32;
+          n -= 32;
+        }
+
+      if (n == 0)
+        return;
+
+      if (n >= 16)
+        {
+          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
+          indices += 16;
+          ptrs += 16;
+          n -= 16;
+        }
+
+      if (n >= 8)
+        {
+          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
+          indices += 8;
+          ptrs += 8;
+          n -= 8;
+        }
+
+      if (n > 4)
+        clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
+      /* last 4 elements of the whole array, addressed from its start */
+      clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
+      return;
+    }
+#endif
+  while (n_elts) /* scalar path: whatever no vector branch returned on */
+    {
+      ptrs[0] = base + ((u64) indices[0] << shift);
+      ptrs += 1;
+      indices += 1;
+      n_elts -= 1;
+    }
+}
+
+#endif
diff --git a/src/vppinfra/vector/test/index_to_ptr.c b/src/vppinfra/vector/test/index_to_ptr.c
new file mode 100644
index 0000000..ae33020
--- /dev/null
+++ b/src/vppinfra/vector/test/index_to_ptr.c
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/index_to_ptr.h>
+
+typedef void (wrapper_fn) (u32 *indices, void *base, u8 shift, void **ptrs,
+ u32 n_elts);
+/* Plain pass-through to clib_index_to_ptr_u32 with the same arguments */
+__clib_test_fn void
+clib_index_to_ptr_u32_wrapper (u32 *indices, void *base, u8 shift, void **ptrs,
+ u32 n_elts)
+{
+ clib_index_to_ptr_u32 (indices, base, shift, ptrs, n_elts);
+}
+/* The test below invokes the wrapper through this pointer */
+static wrapper_fn *wfn = &clib_index_to_ptr_u32_wrapper;
+
+static clib_error_t *
+test_clib_index_to_ptr_u32 (clib_error_t *err)
+{
+  void *_ptrs[512 + 128], **ptrs = _ptrs + 64;
+  void *const poison = (void *) 0xfefefefefefefefe; /* guard/fill pattern */
+  u32 _indices[512 + 128], *indices = _indices + 64;
+  u16 lengths[] = { 1,  3,  5,  7,  9,  15, 16, 17,  31, 32,
+                    33, 40, 41, 42, 63, 64, 65, 511, 512 };
+
+  for (int i = 0; i < ARRAY_LEN (_indices); i++)
+    _indices[i] = i;
+
+  for (int i = 0; i < ARRAY_LEN (lengths); i++)
+    {
+      u16 len = lengths[i];
+      u8 shift = 6;
+      void *base = (void *) 0x100000000 + i;
+      for (int j = -64; j < len + 64; j++)
+        ptrs[j] = poison;
+      wfn (indices, base, shift, ptrs, len);
+      /* [0, len) must be converted; the 64 slots on each side must be intact */
+      for (int j = -64; j < len + 64; j++)
+        {
+          void *expected =
+            j < 0 || j >= len ? poison : base + ((u64) indices[j] << shift);
+          if (ptrs[j] != expected)
+            return clib_error_return (err, "testcase failed for length %u "
+                                      "(offset %d, expected %p, found %p)",
+                                      len, j, expected, ptrs[j]);
+        }
+    }
+  return err;
+}
+
+REGISTER_TEST (clib_index_to_ptr_u32) = {
+ .name = "clib_index_to_ptr_u32",
+ .fn = test_clib_index_to_ptr_u32,
+};