/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15
16#ifndef included_vector_avx2_h
17#define included_vector_avx2_h
18
19#include <vppinfra/clib.h>
20#include <x86intrin.h>
21
Damjan Mariona52e1662018-05-19 00:04:23 +020022/* *INDENT-OFF* */
Damjan Marionc5766222018-04-16 00:18:34 +020023#define foreach_avx2_vec256i \
Damjan Marion4fce7f72018-07-16 14:18:23 +020024 _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
Damjan Marionc5766222018-04-16 00:18:34 +020025#define foreach_avx2_vec256u \
Damjan Marion4fce7f72018-07-16 14:18:23 +020026 _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
Damjan Marionc5766222018-04-16 00:18:34 +020027#define foreach_avx2_vec256f \
28 _(f,32,8,ps) _(f,64,4,pd)
29
Damjan Marion4fce7f72018-07-16 14:18:23 +020030#define _mm256_set1_epi64 _mm256_set1_epi64x
31
Damjan Mariona52e1662018-05-19 00:04:23 +020032/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
33 is_all_equal */
Damjan Marionc5766222018-04-16 00:18:34 +020034#define _(t, s, c, i) \
35static_always_inline t##s##x##c \
36t##s##x##c##_splat (t##s x) \
37{ return (t##s##x##c) _mm256_set1_##i (x); } \
38\
39static_always_inline t##s##x##c \
40t##s##x##c##_load_unaligned (void *p) \
41{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
42\
43static_always_inline void \
44t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
45{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
46\
47static_always_inline int \
48t##s##x##c##_is_all_zero (t##s##x##c x) \
49{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
50\
51static_always_inline int \
Damjan Marion14864772018-05-22 14:07:47 +020052t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
53{ return t##s##x##c##_is_all_zero (a ^ b); } \
Damjan Marionc5766222018-04-16 00:18:34 +020054\
Damjan Mariona52e1662018-05-19 00:04:23 +020055static_always_inline int \
56t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
Damjan Marion4fce7f72018-07-16 14:18:23 +020057{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
58\
59static_always_inline t##s##x##c \
60t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
61{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
62\
63static_always_inline t##s##x##c \
64t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
65{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \
66
Damjan Marionc5766222018-04-16 00:18:34 +020067
68foreach_avx2_vec256i foreach_avx2_vec256u
69#undef _
Damjan Mariona52e1662018-05-19 00:04:23 +020070/* *INDENT-ON* */
71
72always_inline u32x8
Damjan Marionc5766222018-04-16 00:18:34 +020073u32x8_permute (u32x8 v, u32x8 idx)
74{
75 return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
76}
77
Damjan Marion1cf9a162018-05-23 20:21:51 +020078/* _extract_lo, _extract_hi */
79/* *INDENT-OFF* */
80#define _(t1,t2) \
81always_inline t1 \
82t2##_extract_lo (t2 v) \
83{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
84\
85always_inline t1 \
86t2##_extract_hi (t2 v) \
87{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
88\
89always_inline t2 \
90t2##_insert_lo (t2 v1, t1 v2) \
91{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); }\
92\
93always_inline t2 \
94t2##_insert_hi (t2 v1, t1 v2) \
95{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }\
Damjan Marionc5766222018-04-16 00:18:34 +020096
Damjan Marion1cf9a162018-05-23 20:21:51 +020097_(u8x16, u8x32)
98_(u16x8, u16x16)
99_(u32x4, u32x8)
100_(u64x2, u64x4)
101#undef _
102/* *INDENT-ON* */
Damjan Marionc5766222018-04-16 00:18:34 +0200103
Damjan Marionee7f0bd2018-05-05 12:30:28 +0200104
Damjan Marion1cf9a162018-05-23 20:21:51 +0200105
Damjan Marionee7f0bd2018-05-05 12:30:28 +0200106
Damjan Marion8c3f8a22018-05-17 21:12:13 +0200107static_always_inline u32
108u8x32_msb_mask (u8x32 v)
109{
110 return _mm256_movemask_epi8 ((__m256i) v);
111}
112
Damjan Marion90d05bc2020-08-31 17:18:26 +0200113/* _from_ */
Damjan Marionafe56de2018-05-17 12:44:00 +0200114/* *INDENT-OFF* */
115#define _(f,t,i) \
116static_always_inline t \
Damjan Marion90d05bc2020-08-31 17:18:26 +0200117t##_from_##f (f x) \
Damjan Marionafe56de2018-05-17 12:44:00 +0200118{ return (t) _mm256_cvt##i ((__m128i) x); }
119
120_(u16x8, u32x8, epu16_epi32)
121_(u16x8, u64x4, epu16_epi64)
122_(u32x4, u64x4, epu32_epi64)
Lijian.Zhang7e9d5ff2021-04-14 16:12:28 +0800123_ (u8x16, u16x16, epu8_epi16)
Damjan Marionafe56de2018-05-17 12:44:00 +0200124_(u8x16, u32x8, epu8_epi32)
125_(u8x16, u64x4, epu8_epi64)
126_(i16x8, i32x8, epi16_epi32)
127_(i16x8, i64x4, epi16_epi64)
128_(i32x4, i64x4, epi32_epi64)
Lijian.Zhang7e9d5ff2021-04-14 16:12:28 +0800129_ (i8x16, i16x16, epi8_epi16)
Damjan Marionafe56de2018-05-17 12:44:00 +0200130_(i8x16, i32x8, epi8_epi32)
131_(i8x16, i64x4, epi8_epi64)
132#undef _
133/* *INDENT-ON* */
134
Damjan Mariondd648aa2020-03-12 11:56:00 +0100135static_always_inline u64x4
136u64x4_byte_swap (u64x4 v)
137{
138 u8x32 swap = {
139 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
140 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
141 };
142 return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
143}
144
Damjan Marionc899dac2019-04-16 18:41:01 +0200145static_always_inline u32x8
146u32x8_byte_swap (u32x8 v)
147{
148 u8x32 swap = {
149 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
150 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
151 };
152 return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
153}
154
Damjan Marionbf129f42018-06-27 13:03:26 +0200155static_always_inline u16x16
156u16x16_byte_swap (u16x16 v)
157{
158 u8x32 swap = {
159 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
160 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
161 };
162 return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
163}
164
Damjan Marion94dbf952020-07-15 20:18:39 +0200165static_always_inline u8x32
166u8x32_shuffle (u8x32 v, u8x32 m)
167{
168 return (u8x32) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) m);
169}
170
171#define u8x32_align_right(a, b, imm) \
172 (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)
173
174static_always_inline u32
175u32x8_sum_elts (u32x8 sum8)
176{
177 sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
178 sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
179 return sum8[0] + sum8[4];
180}
181
Damjan Marionbf129f42018-06-27 13:03:26 +0200182static_always_inline u32x8
183u32x8_hadd (u32x8 v1, u32x8 v2)
184{
185 return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
186}
187
Damjan Marion08bca802018-06-18 22:21:40 +0200188static_always_inline u16x16
189u16x16_mask_last (u16x16 v, u8 n_last)
190{
191 const u16x16 masks[17] = {
192 {0},
193 {-1},
194 {-1, -1},
195 {-1, -1, -1},
196 {-1, -1, -1, -1},
197 {-1, -1, -1, -1, -1},
198 {-1, -1, -1, -1, -1, -1},
199 {-1, -1, -1, -1, -1, -1, -1},
200 {-1, -1, -1, -1, -1, -1, -1, -1},
201 {-1, -1, -1, -1, -1, -1, -1, -1, -1},
202 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
203 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
204 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
205 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
206 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
207 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
208 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
209 };
210
211 ASSERT (n_last < 17);
212
213 return v & masks[16 - n_last];
214}
215
Damjan Marion69fdfee2018-10-06 14:33:18 +0200216static_always_inline f32x8
217f32x8_from_u32x8 (u32x8 v)
218{
219 return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
220}
221
222static_always_inline u32x8
223u32x8_from_f32x8 (f32x8 v)
224{
225 return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
226}
227
Damjan Marionc899dac2019-04-16 18:41:01 +0200228#define u32x8_blend(a,b,m) \
229 (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)
230
Damjan Marion07243572018-11-20 10:06:57 +0100231#define u16x16_blend(v1, v2, mask) \
232 (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)
233
234static_always_inline u64x4
235u64x4_gather (void *p0, void *p1, void *p2, void *p3)
236{
237 u64x4 r = {
238 *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
239 };
240 return r;
241}
242
243static_always_inline u32x8
244u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
245 void *p6, void *p7)
246{
247 u32x8 r = {
248 *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
249 *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
250 };
251 return r;
252}
253
254
255static_always_inline void
256u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
257{
258 *(u64 *) p0 = r[0];
259 *(u64 *) p1 = r[1];
260 *(u64 *) p2 = r[2];
261 *(u64 *) p3 = r[3];
262}
263
264static_always_inline void
265u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
266 void *p5, void *p6, void *p7)
267{
268 *(u32 *) p0 = r[0];
269 *(u32 *) p1 = r[1];
270 *(u32 *) p2 = r[2];
271 *(u32 *) p3 = r[3];
272 *(u32 *) p4 = r[4];
273 *(u32 *) p5 = r[5];
274 *(u32 *) p6 = r[6];
275 *(u32 *) p7 = r[7];
276}
277
278static_always_inline void
279u64x4_scatter_one (u64x4 r, int index, void *p)
280{
281 *(u64 *) p = r[index];
282}
283
284static_always_inline void
285u32x8_scatter_one (u32x8 r, int index, void *p)
286{
287 *(u32 *) p = r[index];
288}
289
Damjan Marionc59b9a22019-03-19 15:38:40 +0100290static_always_inline u8x32
291u8x32_is_greater (u8x32 v1, u8x32 v2)
292{
293 return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
294}
295
296static_always_inline u8x32
297u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
298{
299 return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
300 (__m256i) mask);
301}
302
Damjan Marion9f7e33d2019-04-08 10:14:51 +0200303#define u32x8_permute_lanes(a, b, m) \
304 (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
305#define u64x4_permute_lanes(a, b, m) \
306 (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
307
Damjan Marionc899dac2019-04-16 18:41:01 +0200308static_always_inline u32x8
309u32x8_min (u32x8 a, u32x8 b)
310{
311 return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
312}
313
314static_always_inline u32
315u32x8_min_scalar (u32x8 v)
316{
317 return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
318 u32x8_extract_hi (v)));
319}
320
Damjan Marion9f7e33d2019-04-08 10:14:51 +0200321static_always_inline void
322u32x8_transpose (u32x8 a[8])
323{
324 u64x4 r[8], x, y;
325
326 r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
327 r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
328 r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
329 r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
330 r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
331 r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
332 r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
333 r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);
334
335 x = u64x4_interleave_lo (r[0], r[2]);
336 y = u64x4_interleave_lo (r[4], r[6]);
337 a[0] = u32x8_permute_lanes (x, y, 0x20);
338 a[4] = u32x8_permute_lanes (x, y, 0x31);
339
340 x = u64x4_interleave_hi (r[0], r[2]);
341 y = u64x4_interleave_hi (r[4], r[6]);
342 a[1] = u32x8_permute_lanes (x, y, 0x20);
343 a[5] = u32x8_permute_lanes (x, y, 0x31);
344
345 x = u64x4_interleave_lo (r[1], r[3]);
346 y = u64x4_interleave_lo (r[5], r[7]);
347 a[2] = u32x8_permute_lanes (x, y, 0x20);
348 a[6] = u32x8_permute_lanes (x, y, 0x31);
349
350 x = u64x4_interleave_hi (r[1], r[3]);
351 y = u64x4_interleave_hi (r[5], r[7]);
352 a[3] = u32x8_permute_lanes (x, y, 0x20);
353 a[7] = u32x8_permute_lanes (x, y, 0x31);
354}
355
356static_always_inline void
357u64x4_transpose (u64x4 a[8])
358{
359 u64x4 r[4];
360
361 r[0] = u64x4_interleave_lo (a[0], a[1]);
362 r[1] = u64x4_interleave_hi (a[0], a[1]);
363 r[2] = u64x4_interleave_lo (a[2], a[3]);
364 r[3] = u64x4_interleave_hi (a[2], a[3]);
365
366 a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
367 a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
368 a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
369 a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
370}
371
#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */