/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_index_to_ptr_h
#define included_vector_index_to_ptr_h
#include <vppinfra/clib.h>

#ifdef CLIB_HAVE_VEC128
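/* Expand 4 consecutive u32 indices starting at offset i into 4 pointers:
   ptrs[i + k] = (indices[i + k] << shift) + ov.  The low and high pairs of
   the 128-bit index vector are widened to u64x2 separately; on aarch64 the
   high half is widened directly, elsewhere it is first shifted down by 8
   bytes. */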
static_always_inline void
clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
{
  u32x4 iv4 = u32x4_load_unaligned (indices + i);
  u64x2 pv2;
  pv2 = u64x2_from_u32x4 (iv4);
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
#ifdef __aarch64__
  pv2 = u64x2_from_u32x4_high (iv4);
#else
  pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
#endif
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
}
#endif

/** \brief Convert an array of indices to pointers using base and shift

    @param indices source array of u32 indices
    @param base base pointer
    @param shift number of bits each index is shifted left before being added
	   to base
    @param ptrs destination array of pointers
    @param n_elts number of elements in the source array
*/

static_always_inline void
clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
		       u32 n_elts)
{
#if defined CLIB_HAVE_VEC512
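  /* 512-bit path: widen 8 indices per u64x8, unrolled to 64/32/16/8 elements
     per step.  A remainder of 1-7 elements is written with a final
     overlapping store; fewer than 8 elements in total take the masked
     load/store branch below. */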
  if (n_elts >= 8)
    {
      u64x8 off = u64x8_splat ((u64) base);
      u64x8 b0, b1, b2, b3, b4, b5, b6, b7;

      while (n_elts >= 64)
	{
	  b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
	  b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
	  b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
	  b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
	  b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
	  b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
	  b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
	  b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
	  u64x8_store_unaligned ((b0 << shift) + off, ptrs);
	  u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
	  u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
	  u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
	  u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
	  u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
	  u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
	  u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
	  ptrs += 64;
	  indices += 64;
	  n_elts -= 64;
	}

      if (n_elts == 0)
	return;

      if (n_elts >= 32)
	{
	  b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
	  b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
	  b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
	  b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
	  u64x8_store_unaligned ((b0 << shift) + off, ptrs);
	  u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
	  u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
	  u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
	  ptrs += 32;
	  indices += 32;
	  n_elts -= 32;
	}
      if (n_elts >= 16)
	{
	  b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
	  b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
	  u64x8_store_unaligned ((b0 << shift) + off, ptrs);
	  u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
	  ptrs += 16;
	  indices += 16;
	  n_elts -= 16;
	}
      if (n_elts >= 8)
	{
	  b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
	  u64x8_store_unaligned ((b0 << shift) + off, ptrs);
	  ptrs += 8;
	  indices += 8;
	  n_elts -= 8;
	}

      if (n_elts == 0)
	return;

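      /* 1-7 elements remain: reload the last 8 indices of the array and
	 rewrite the tail with an overlapping store.  This is safe because at
	 least 8 pointers were already written above. */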
      b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
      u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
    }
  else
    {
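      /* fewer than 8 elements: masked load of the valid indices (invalid
	 lanes zeroed) and masked store of the resulting pointers */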
      u32 mask = pow2_mask (n_elts);
      u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
      u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
      return;
    }
#elif defined CLIB_HAVE_VEC256
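  /* 256-bit path: widen 4 indices per u64x4, unrolled to 32/16/8/4 elements
     per step, finishing with an overlapping store of the last 4 pointers.
     Fewer than 4 elements in total use the masked branch when available. */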
  if (n_elts >= 4)
    {
      u64x4 off = u64x4_splat ((u64) base);
      u64x4 b0, b1, b2, b3, b4, b5, b6, b7;

      while (n_elts >= 32)
	{
	  b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
	  b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
	  b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
	  b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
	  b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
	  b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
	  b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
	  b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
	  u64x4_store_unaligned ((b0 << shift) + off, ptrs);
	  u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
	  u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
	  u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
	  u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
	  u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
	  u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
	  u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
	  ptrs += 32;
	  indices += 32;
	  n_elts -= 32;
	}

      if (n_elts == 0)
	return;

      if (n_elts >= 16)
	{
	  b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
	  b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
	  b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
	  b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
	  u64x4_store_unaligned ((b0 << shift) + off, ptrs);
	  u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
	  u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
	  u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
	  ptrs += 16;
	  indices += 16;
	  n_elts -= 16;
	}
      if (n_elts >= 8)
	{
	  b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
	  b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
	  u64x4_store_unaligned ((b0 << shift) + off, ptrs);
	  u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
	  ptrs += 8;
	  indices += 8;
	  n_elts -= 8;
	}
      if (n_elts > 4)
	{
	  b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
	  u64x4_store_unaligned ((b0 << shift) + off, ptrs);
	  ptrs += 4;
	  indices += 4;
	  n_elts -= 4;
	}

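      /* write the last 1-4 elements with an overlapping store of the final
	 four pointers; any overlap was already written correctly above */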
      b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
      u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
      return;
    }
#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
  else
    {
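      /* fewer than 4 elements: masked load of the valid indices and masked
	 store of the resulting pointers */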
      u32 mask = pow2_mask (n_elts);
      u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
      u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
      return;
    }
#endif
#elif defined(CLIB_HAVE_VEC128)
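  /* 128-bit path: clib_index_to_ptr_u32x4() converts 4 indices per call,
     unrolled to 32/16/8 elements per step.  The original base pointers are
     saved in i/p so the last 4 elements can be rewritten at the end. */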
  if (n_elts >= 4)
    {
      u64x2 ov = u64x2_splat ((u64) base);
      u32 *i = (u32 *) indices;
      void **p = (void **) ptrs;
      u32 n = n_elts;

      while (n >= 32)
	{
	  clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
	  indices += 32;
	  ptrs += 32;
	  n -= 32;
	}

      if (n == 0)
	return;

      if (n >= 16)
	{
	  clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
	  indices += 16;
	  ptrs += 16;
	  n -= 16;
	}

      if (n >= 8)
	{
	  clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
	  clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
	  indices += 8;
	  ptrs += 8;
	  n -= 8;
	}

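      /* 1-7 elements remain: if more than 4 are left, convert 4 at the
	 current position, then always rewrite the last 4 elements of the
	 original array (overlapping stores) via the saved i/p pointers */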
      if (n > 4)
	clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);

      clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
      return;
    }
#endif
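  /* scalar fallback; also handles short arrays on paths without masked
     vector load/store support */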
  while (n_elts)
    {
      ptrs[0] = base + ((u64) indices[0] << shift);
      ptrs += 1;
      indices += 1;
      n_elts -= 1;
    }
}
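/*
 * Usage sketch (illustrative only, names are hypothetical): expanding
 * indices into an array of 64-byte elements, i.e. base = elts, shift = 6:
 *
 *   u32 indices[3] = { 0, 2, 7 };
 *   void *ptrs[3];
 *   clib_index_to_ptr_u32 (indices, elts, 6, ptrs, 3);
 *   // now ptrs[k] == (void *) elts + ((u64) indices[k] << 6)
 */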

#endif