/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

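/* AVX2 provides _mm256_set1_epi64x; alias it so the token-pasted
   _mm256_set1_##i form used by the splat macro below resolves. */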
#define _mm256_set1_epi64 _mm256_set1_epi64x

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, interleave_lo, interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \


foreach_avx2_vec256i foreach_avx2_vec256u
#undef _

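/* permute 32-bit elements across lanes: result[i] = v[idx[i] & 7] (vpermd) */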
always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

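/* permute 64-bit elements by immediate selectors (vpermq),
   e.g. u64x4_permute (v, 3, 2, 1, 0) reverses the element order */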
#define u64x4_permute(v, m0, m1, m2, m3) \
  (u64x4) _mm256_permute4x64_epi64 ( \
    (__m256i) v, ((m0) | (m1) << 2 | (m2) << 4 | (m3) << 6))

/* _extract_lo, _extract_hi, _insert_lo, _insert_hi */
#define _(t1,t2) \
always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); }\
\
always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }\

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _

/* 256 bit packs: saturating narrowing of two vectors, interleaved per
   128-bit lane. */
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m256i) lo, (__m256i) hi); \
  }

_ (i16x16, i8x32, _mm256_packs_epi16)
_ (i16x16, u8x32, _mm256_packus_epi16)
_ (i32x8, i16x16, _mm256_packs_epi32)
_ (i32x8, u16x16, _mm256_packus_epi32)

#undef _

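/* collect the most significant bit of each byte into a 32-bit mask
   (vpmovmskb) */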
static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

static_always_inline u32
i8x32_msb_mask (i8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

/* _from_: zero/sign extending conversions from narrower vector types */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_(u16x8, u32x8, epu16_epi32)
_(u16x8, u64x4, epu16_epi64)
_(u32x4, u64x4, epu32_epi64)
_ (u8x16, u16x16, epu8_epi16)
_(u8x16, u32x8, epu8_epi32)
_(u8x16, u64x4, epu8_epi64)
_(i16x8, i32x8, epi16_epi32)
_(i16x8, i64x4, epi16_epi64)
_(i32x4, i64x4, epi32_epi64)
_ (i8x16, i16x16, epi8_epi16)
_(i8x16, i32x8, epi8_epi32)
_(i8x16, i64x4, epi8_epi64)
#undef _

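/* reverse byte order within each 64/32/16-bit element using a per-lane
   constant byte shuffle (vpshufb) */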
static_always_inline u64x4
u64x4_byte_swap (u64x4 v)
{
  u8x32 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
  u8x32 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

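/* u8x32_align_right: within each 128-bit lane, concatenate the lanes of
   a (high) and b (low) and shift right by imm bytes (vpalignr).
   u64x4_align_right shifts the full a:b concatenation right by imm 64-bit
   elements (valignq, requires AVX-512VL). */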
#define u8x32_align_right(a, b, imm) \
  (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)

#define u64x4_align_right(a, b, imm) \
  (u64x4) _mm256_alignr_epi64 ((__m256i) a, (__m256i) b, imm)

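/* sum of all eight 32-bit elements: two in-lane rotations reduce each
   128-bit lane, then the two lane totals are added */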
static_always_inline u32
u32x8_sum_elts (u32x8 sum8)
{
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
  return sum8[0] + sum8[4];
}

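/* pairwise horizontal add (vphaddd); the result interleaves v1 and v2 per
   128-bit lane: { v1_0+v1_1, v1_2+v1_3, v2_0+v2_1, v2_2+v2_3, v1_4+v1_5, ... } */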
static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

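/* xor-reduce all eight 32-bit elements to a scalar */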
static_always_inline u32
u32x8_hxor (u32x8 v)
{
  u32x4 v4;
  v4 = u32x8_extract_lo (v) ^ u32x8_extract_hi (v);
  v4 ^= (u32x4) u8x16_align_right (v4, v4, 8);
  v4 ^= (u32x4) u8x16_align_right (v4, v4, 4);
  return v4[0];
}

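/* three-way xor (a ^ b ^ c); a single vpternlogd when AVX-512F is enabled */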
static_always_inline u8x32
u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
{
#if __AVX512F__
  return (u8x32) _mm256_ternarylogic_epi32 ((__m256i) a, (__m256i) b,
					    (__m256i) c, 0x96);
#endif
  return a ^ b ^ c;
}

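/* reverse the order of the 16 bytes within each 128-bit lane */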
static_always_inline u8x32
u8x32_reflect_u8x16 (u8x32 x)
{
  static const u8x32 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x32) _mm256_shuffle_epi8 ((__m256i) x, (__m256i) mask);
}

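/* zero the last n_last (0..16) elements of v, keeping the first 16 - n_last */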
static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}

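/* int <-> float conversions; these use the signed forms (vcvtdq2ps /
   vcvttps2dq), so the float -> int path truncates toward zero */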
static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}

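/* immediate blends: bit i of the mask selects element i from the second
   operand; for u16x16 the 8-bit mask is applied to each 128-bit lane */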
#define u32x8_blend(a,b,m) \
  (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)

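/* element-wise gather/scatter through arbitrary pointers, implemented with
   scalar loads and stores */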
static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
	      void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}


static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
	       void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

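/* gather/scatter 32-bit elements at base + indices[i] * scale; scale must
   be a compile-time constant (1, 2, 4 or 8).  Without AVX-512F the scatter
   falls back to a scalar loop. */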
#define u32x8_gather_u32(base, indices, scale) \
  (u32x8) _mm256_i32gather_epi32 ((const int *) base, (__m256i) indices, scale)

#ifdef __AVX512F__
#define u32x8_scatter_u32(base, indices, v, scale) \
  _mm256_i32scatter_epi32 (base, (__m256i) indices, (__m256i) v, scale)
#else
#define u32x8_scatter_u32(base, indices, v, scale) \
  for (u32 i = 0; i < 8; i++) \
    *((u32u *) ((u8 *) base + (scale) * (indices)[i])) = (v)[i];
#endif

static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{
  return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
				     (__m256i) mask);
}

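/* shift each 128-bit lane left/right by n bytes (vpslldq / vpsrldq) */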
#define u8x32_word_shift_left(a, n) \
  (u8x32) _mm256_bslli_epi128 ((__m256i) a, n)
#define u8x32_word_shift_right(a, n) \
  (u8x32) _mm256_bsrli_epi128 ((__m256i) a, n)

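/* select 128-bit lanes from a and b, e.g. m = 0x20 takes the low lanes of
   both, m = 0x31 the high lanes of both */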
#define u32x8_permute_lanes(a, b, m) \
  (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
  (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)

static_always_inline u32x8
u32x8_min (u32x8 a, u32x8 b)
{
  return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
}

static_always_inline u32
u32x8_min_scalar (u32x8 v)
{
  return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
				      u32x8_extract_hi (v)));
}

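/* in-place transpose of an 8x8 matrix of u32 held in 8 row vectors:
   interleave 32-bit elements, then 64-bit pairs, then recombine the
   128-bit lanes */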
static_always_inline void
u32x8_transpose (u32x8 a[8])
{
  u64x4 r[8], x, y;

  r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
  r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
  r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
  r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
  r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
  r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
  r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
  r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);

  x = u64x4_interleave_lo (r[0], r[2]);
  y = u64x4_interleave_lo (r[4], r[6]);
  a[0] = u32x8_permute_lanes (x, y, 0x20);
  a[4] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[0], r[2]);
  y = u64x4_interleave_hi (r[4], r[6]);
  a[1] = u32x8_permute_lanes (x, y, 0x20);
  a[5] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_lo (r[1], r[3]);
  y = u64x4_interleave_lo (r[5], r[7]);
  a[2] = u32x8_permute_lanes (x, y, 0x20);
  a[6] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[1], r[3]);
  y = u64x4_interleave_hi (r[5], r[7]);
  a[3] = u32x8_permute_lanes (x, y, 0x20);
  a[7] = u32x8_permute_lanes (x, y, 0x31);
}

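/* in-place transpose of a 4x4 matrix of u64 held in a[0] .. a[3] */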
static_always_inline void
u64x4_transpose (u64x4 a[8])
{
  u64x4 r[4];

  r[0] = u64x4_interleave_lo (a[0], a[1]);
  r[1] = u64x4_interleave_hi (a[0], a[1]);
  r[2] = u64x4_interleave_lo (a[2], a[3]);
  r[3] = u64x4_interleave_hi (a[2], a[3]);

  a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
  a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
  a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
  a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
}

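/* broadcast a 128-bit vector into both 128-bit lanes */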
static_always_inline u8x32
u8x32_splat_u8x16 (u8x16 a)
{
  return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u32x8
u32x8_splat_u32x4 (u32x4 a)
{
  return (u32x8) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u64x4
u64x4_splat_u64x2 (u64x2 a)
{
  return (u64x4) _mm256_broadcastsi128_si256 ((__m128i) a);
}

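/* load/store only the first n bytes (n <= 32); masked operations when
   available, otherwise split into 128-bit partial loads/stores */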
static_always_inline u8x32
u8x32_load_partial (u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  return u8x32_mask_load_zero (data, pow2_mask (n));
#else
  u8x32 r = {};
  if (n > 16)
    {
      r = u8x32_insert_lo (r, *(u8x16u *) data);
      r = u8x32_insert_hi (r, u8x16_load_partial (data + 16, n - 16));
    }
  else
    r = u8x32_insert_lo (r, u8x16_load_partial (data, n));
  return r;
#endif
}

static_always_inline void
u8x32_store_partial (u8x32 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  u8x32_mask_store (r, data, pow2_mask (n));
#else
  if (n > 16)
    {
      *(u8x16u *) data = u8x32_extract_lo (r);
      u8x16_store_partial (u8x32_extract_hi (r), data + 16, n - 16);
    }
  else
    u8x16_store_partial (u8x32_extract_lo (r), data, n);
#endif
}

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */