/* SPDX-License-Identifier: Apache-2.0
* Copyright(c) 2021 Cisco Systems, Inc.
*/
#ifndef included_vector_index_to_ptr_h
#define included_vector_index_to_ptr_h
#include <vppinfra/clib.h>
#ifdef CLIB_HAVE_VEC128
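/* Convert 4 consecutive u32 indices starting at indices[i] into 4 pointers
 * stored at ptrs[i]: each index is widened to 64 bits, shifted left by
 * 'shift' and added to the base offset pre-splatted into 'ov'. */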
static_always_inline void
clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
{
u32x4 iv4 = u32x4_load_unaligned (indices + i);
u64x2 pv2;
pv2 = u64x2_from_u32x4 (iv4);
u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
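/* Widen the upper two u32 lanes: aarch64 has a dedicated high-half
 * conversion, other targets byte-shift the vector right by 8 so the
 * low-half conversion can be reused. */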
#ifdef __aarch64__
pv2 = u64x2_from_u32x4_high (iv4);
#else
pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
#endif
u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
}
#endif
/** \brief Convert array of indices to pointers with base and shift
@param indices source array of u32 indices
@param base base pointer
@param shift number of bits each index is shifted left by
@param ptrs destination array of pointers
@param n_elts number of elements in the source array
*/
static_always_inline void
clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
u32 n_elts)
{
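/* Compute ptrs[i] = base + ((u64) indices[i] << shift) for i < n_elts,
 * using the widest vector extension available (512-, 256- or 128-bit)
 * and falling back to a scalar loop otherwise. */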
#if defined CLIB_HAVE_VEC512
if (n_elts >= 8)
{
u64x8 off = u64x8_splat ((u64) base);
u64x8 b0, b1, b2, b3, b4, b5, b6, b7;
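/* Main unrolled loop: 64 indices per iteration, each u32x8 load widened
 * to u64x8, shifted and offset by the splatted base. */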
while (n_elts >= 64)
{
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
u64x8_store_unaligned ((b0 << shift) + off, ptrs);
u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
ptrs += 64;
indices += 64;
n_elts -= 64;
}
if (n_elts == 0)
return;
if (n_elts >= 32)
{
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
u64x8_store_unaligned ((b0 << shift) + off, ptrs);
u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
ptrs += 32;
indices += 32;
n_elts -= 32;
}
if (n_elts >= 16)
{
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
u64x8_store_unaligned ((b0 << shift) + off, ptrs);
u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
ptrs += 16;
indices += 16;
n_elts -= 16;
}
if (n_elts > 8)
{
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
u64x8_store_unaligned ((b0 << shift) + off, ptrs);
ptrs += 8;
indices += 8;
n_elts -= 8;
}
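/* At most 8 elements remain: convert the final 8 indices of the array;
 * the load and store may overlap elements already processed above,
 * rewriting identical values. */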
b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
return;
}
else
{
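/* Fewer than 8 elements: a masked load pulls in only the valid indices
 * (the rest read as zero) and a masked store writes only the valid
 * pointer slots. */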
u32 mask = pow2_mask (n_elts);
u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
return;
}
#elif defined CLIB_HAVE_VEC256
if (n_elts >= 4)
{
u64x4 off = u64x4_splat ((u64) base);
u64x4 b0, b1, b2, b3, b4, b5, b6, b7;
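/* Main unrolled loop: 32 indices per iteration, each u32x4 load widened
 * to u64x4, shifted and offset by the splatted base. */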
while (n_elts >= 32)
{
b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
u64x4_store_unaligned ((b0 << shift) + off, ptrs);
u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
ptrs += 32;
indices += 32;
n_elts -= 32;
}
if (n_elts == 0)
return;
if (n_elts >= 16)
{
b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
u64x4_store_unaligned ((b0 << shift) + off, ptrs);
u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
ptrs += 16;
indices += 16;
n_elts -= 16;
}
if (n_elts >= 8)
{
b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
u64x4_store_unaligned ((b0 << shift) + off, ptrs);
u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
ptrs += 8;
indices += 8;
n_elts -= 8;
}
if (n_elts > 4)
{
b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
u64x4_store_unaligned ((b0 << shift) + off, ptrs);
ptrs += 4;
indices += 4;
n_elts -= 4;
}
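/* At most 4 elements remain: convert the final 4 indices, overlapping
 * already-written entries with identical values where necessary. */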
b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
return;
}
#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
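/* Fewer than 4 elements: use masked 256-bit load/store when the target
 * supports it; without this support the short case falls through to the
 * scalar loop at the end of the function. */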
else
{
u32 mask = pow2_mask (n_elts);
u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
return;
}
#endif
#elif defined(CLIB_HAVE_VEC128)
if (n_elts >= 4)
{
u64x2 ov = u64x2_splat ((u64) base);
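/* Keep copies of the original pointers and element count; the tail below
 * is finished with an overlapping conversion of the last 4 elements. */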
u32 *i = (u32 *) indices;
void **p = (void **) ptrs;
u32 n = n_elts;
while (n >= 32)
{
clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
indices += 32;
ptrs += 32;
n -= 32;
}
if (n == 0)
return;
if (n >= 16)
{
clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
indices += 16;
ptrs += 16;
n -= 16;
}
if (n >= 8)
{
clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
indices += 8;
ptrs += 8;
n -= 8;
}
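/* 1-7 elements left: convert 4 at the current position if more than 4
 * remain, then unconditionally convert the last 4 elements of the
 * original array, overlapping entries already written. */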
if (n > 4)
clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
return;
}
#endif
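/* Scalar fallback: used when no vector unit is available and for short
 * inputs not covered by the vector paths above. */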
while (n_elts)
{
ptrs[0] = base + ((u64) indices[0] << shift);
ptrs += 1;
indices += 1;
n_elts -= 1;
}
}
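/* Usage sketch (illustrative only; the 64-byte element size and the pool
 * layout below are assumptions, not something this header defines):
 *
 *   u32 indices[4] = { 0, 1, 7, 42 };
 *   void *ptrs[4];
 *   void *pool_base = ...;  // start of an array of 64-byte elements
 *   clib_index_to_ptr_u32 (indices, pool_base, 6, ptrs, 4);
 *   // now ptrs[k] == pool_base + ((u64) indices[k] << 6)
 */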
#endif /* included_vector_index_to_ptr_h */