/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_index_to_ptr_h
#define included_vector_index_to_ptr_h
#include <vppinfra/clib.h>

#ifdef CLIB_HAVE_VEC128
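/* Convert 4 consecutive u32 indices starting at indices[i] into pointers:
   each index is widened to 64 bits, shifted left by 'shift', added to the
   base offset held in ov and stored to ptrs[i] .. ptrs[i + 3]. */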
static_always_inline void
clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
{
  u32x4 iv4 = u32x4_load_unaligned (indices + i);
  u64x2 pv2;
  pv2 = u64x2_from_u32x4 (iv4);
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
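  /* widen the upper two indices: aarch64 has a dedicated high-half
     conversion, elsewhere shift the vector right by 8 bytes first */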
#ifdef __aarch64__
  pv2 = u64x2_from_u32x4_high (iv4);
#else
  pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
#endif
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
}
#endif

/** \brief Convert array of indices to pointers with base and shift

    @param indices source array of u32 indices
    @param base base pointer
    @param shift number of bits to shift each index by
    @param ptrs destination array of pointers
    @param n_elts number of elements in the source array
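
    Example (illustrative): with base pointing to a pool whose elements are
    each (1 << shift) bytes, the call is equivalent to the scalar loop

      ptrs[i] = base + ((u64) indices[i] << shift);

    executed for i = 0 .. n_elts - 1.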
*/

static_always_inline void
clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
                       u32 n_elts)
{
#if defined CLIB_HAVE_VEC512
  if (n_elts >= 8)
    {
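      /* convert 64, 32, 16 or 8 indices per step, then finish with an
         overlapping 8-wide tail store */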
      u64x8 off = u64x8_splat ((u64) base);
      u64x8 b0, b1, b2, b3, b4, b5, b6, b7;

      while (n_elts >= 64)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
          b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
          b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
          b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
          b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
          u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
          u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
          u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
          u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
          ptrs += 64;
          indices += 64;
          n_elts -= 64;
        }

      if (n_elts == 0)
        return;

      if (n_elts >= 32)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
          ptrs += 32;
          indices += 32;
          n_elts -= 32;
        }
      if (n_elts >= 16)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          ptrs += 16;
          indices += 16;
          n_elts -= 16;
        }
      if (n_elts > 8)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          ptrs += 8;
          indices += 8;
          n_elts -= 8;
        }

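      /* overlapping tail: (re)process the last 8 elements of the array so
         any remaining count below 8 is covered */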
      b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
      u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
      return;
    }
  else
    {
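      /* fewer than 8 elements: handle them with a single masked 512-bit
         load/store */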
      u32 mask = pow2_mask (n_elts);
      u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
      u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
      return;
    }
#elif defined CLIB_HAVE_VEC256
  if (n_elts >= 4)
    {
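      /* same structure as the 512-bit variant, using 256-bit vectors that
         hold 4 pointers each */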
      u64x4 off = u64x4_splat ((u64) base);
      u64x4 b0, b1, b2, b3, b4, b5, b6, b7;

      while (n_elts >= 32)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
          b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
          b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
          b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
          b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
          u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
          u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
          u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
          u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
          ptrs += 32;
          indices += 32;
          n_elts -= 32;
        }

      if (n_elts == 0)
        return;

      if (n_elts >= 16)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
          ptrs += 16;
          indices += 16;
          n_elts -= 16;
        }
      if (n_elts >= 8)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          ptrs += 8;
          indices += 8;
          n_elts -= 8;
        }
      if (n_elts > 4)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          ptrs += 4;
          indices += 4;
          n_elts -= 4;
        }

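      /* overlapping tail: (re)process the last 4 elements of the array,
         overlapping entries already written when fewer than 4 remain */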
      b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
      u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
      return;
    }
#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
  else
    {
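      /* fewer than 4 elements and masked ops are available: handle them
         with a single masked 256-bit load/store */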
      u32 mask = pow2_mask (n_elts);
      u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
      u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
      return;
    }
#endif
#elif defined(CLIB_HAVE_VEC128)
  if (n_elts >= 4)
    {
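      /* 128-bit variant: each helper call converts 4 indices using two
         u64x2 stores */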
      u64x2 ov = u64x2_splat ((u64) base);
      u32 *i = (u32 *) indices;
      void **p = (void **) ptrs;
      u32 n = n_elts;

      while (n >= 32)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
          indices += 32;
          ptrs += 32;
          n -= 32;
        }

      if (n == 0)
        return;

      if (n >= 16)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
          indices += 16;
          ptrs += 16;
          n -= 16;
        }

      if (n >= 8)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          indices += 8;
          ptrs += 8;
          n -= 8;
        }

      if (n > 4)
        clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);

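      /* overlapping tail: convert the last 4 indices of the original array
         through the saved i/p pointers, overlapping entries already written */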
      clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
      return;
    }
#endif
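  /* scalar fallback for platforms without vector support and for element
     counts not handled above */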
  while (n_elts)
    {
      ptrs[0] = base + ((u64) indices[0] << shift);
      ptrs += 1;
      indices += 1;
      n_elts -= 1;
    }
}

#endif