/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_index_to_ptr_h
#define included_vector_index_to_ptr_h
#include <vppinfra/clib.h>

#ifdef CLIB_HAVE_VEC128
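/* Expand 4 u32 indices starting at indices[i] into 4 pointers stored at
   ptrs[i]: each index is widened to u64, shifted left by 'shift' and added
   to the base offset vector 'ov'. */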
static_always_inline void
clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
{
  u32x4 iv4 = u32x4_load_unaligned (indices + i);
  u64x2 pv2;
  pv2 = u64x2_from_u32x4 (iv4);
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
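  /* widen the upper two indices; aarch64 can take the high half directly,
     elsewhere shift the vector right by 8 bytes first */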
#ifdef __aarch64__
  pv2 = u64x2_from_u32x4_high (iv4);
#else
  pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
#endif
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
}
#endif

/** \brief Convert array of indices to pointers with base and shift

    @param indices source array of u32 indices
    @param base base pointer
    @param shift number of bits to shift
    @param ptrs destination array of pointers
    @param n_elts number of elements in the source array
*/
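/* A minimal usage sketch, assuming a hypothetical table 'tbl' of 64-byte
   records, so that ptrs[i] = tbl + (indices[i] << 6):

     u32 indices[4] = { 0, 1, 2, 3 };
     void *ptrs[4];
     clib_index_to_ptr_u32 (indices, tbl, 6, ptrs, 4);

   e.g. indices[2] resolves to tbl + 128. */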

static_always_inline void
clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
                       u32 n_elts)
{
#if defined CLIB_HAVE_VEC512
  if (n_elts >= 8)
    {
      u64x8 off = u64x8_splat ((u64) base);
      u64x8 b0, b1, b2, b3, b4, b5, b6, b7;

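      /* 8x-unrolled main loop: convert 64 indices per iteration */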
      while (n_elts >= 64)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
          b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
          b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
          b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
          b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
          u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
          u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
          u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
          u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
          ptrs += 64;
          indices += 64;
          n_elts -= 64;
        }

      if (n_elts == 0)
        return;

      if (n_elts >= 32)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
          ptrs += 32;
          indices += 32;
          n_elts -= 32;
        }
      if (n_elts >= 16)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          ptrs += 16;
          indices += 16;
          n_elts -= 16;
        }
      if (n_elts >= 8)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          ptrs += 8;
          indices += 8;
          n_elts -= 8;
        }

      if (n_elts == 0)
        return;

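      /* 1-7 elements remain: redo the last full 8-wide block with an
         overlapping store, valid because n_elts >= 8 held on entry */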
      b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
      u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
    }
  else
    {
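      /* fewer than 8 elements: convert them in one masked load/store */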
      u32 mask = pow2_mask (n_elts);
      u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
      u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
      return;
    }
#elif defined CLIB_HAVE_VEC256
  if (n_elts >= 4)
    {
      u64x4 off = u64x4_splat ((u64) base);
      u64x4 b0, b1, b2, b3, b4, b5, b6, b7;

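      /* 8x-unrolled main loop: convert 32 indices per iteration */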
      while (n_elts >= 32)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
          b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
          b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
          b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
          b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
          u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
          u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
          u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
          u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
          ptrs += 32;
          indices += 32;
          n_elts -= 32;
        }

      if (n_elts == 0)
        return;

      if (n_elts >= 16)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
          ptrs += 16;
          indices += 16;
          n_elts -= 16;
        }
      if (n_elts >= 8)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          ptrs += 8;
          indices += 8;
          n_elts -= 8;
        }
      if (n_elts > 4)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          ptrs += 4;
          indices += 4;
          n_elts -= 4;
        }

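      /* finish with one (possibly overlapping) store covering the last 4
         elements, valid because n_elts >= 4 held on entry */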
      b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
      u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
      return;
    }
#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
  else
    {
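      /* fewer than 4 elements: convert them in one masked load/store */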
      u32 mask = pow2_mask (n_elts);
      u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
      u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
      return;
    }
#endif
#elif defined(CLIB_HAVE_VEC128)
  if (n_elts >= 4)
    {
      u64x2 ov = u64x2_splat ((u64) base);
      u32 *i = (u32 *) indices;
      void **p = (void **) ptrs;
      u32 n = n_elts;

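      /* main loop: convert 32 indices per iteration, 4 at a time */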
      while (n >= 32)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
          indices += 32;
          ptrs += 32;
          n -= 32;
        }

      if (n == 0)
        return;

      if (n >= 16)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
          indices += 16;
          ptrs += 16;
          n -= 16;
        }

      if (n >= 8)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          indices += 8;
          ptrs += 8;
          n -= 8;
        }

      if (n > 4)
        clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);

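      /* finish with one (possibly overlapping) 4-wide step addressed from the
         original pointers, valid because n_elts >= 4 held on entry */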
      clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
      return;
    }
#endif
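  /* scalar loop: fallback when no vector unit is available, and for any
     elements left over by the paths above */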
  while (n_elts)
    {
      ptrs[0] = base + ((u64) indices[0] << shift);
      ptrs += 1;
      indices += 1;
      n_elts -= 1;
    }
}

#endif