/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

#define _mm256_set1_epi64 _mm256_set1_epi64x

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \


foreach_avx2_vec256i foreach_avx2_vec256u
#undef _
/* *INDENT-ON* */

always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

#define u64x4_permute(v, m0, m1, m2, m3)                                      \
  (u64x4) _mm256_permute4x64_epi64 (                                          \
    (__m256i) v, ((m0) | (m1) << 2 | (m2) << 4 | (m3) << 6))
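
/* Usage sketch (illustrative, not part of the original header): reverse the
   lane order of a vector. u32x8_permute takes a per-lane index vector, while
   u64x4_permute takes four compile-time lane selectors. */
static_always_inline u32x8
u32x8_example_reverse (u32x8 v)
{
  u32x8 idx = { 7, 6, 5, 4, 3, 2, 1, 0 };
  return u32x8_permute (v, idx);
}

static_always_inline u64x4
u64x4_example_reverse (u64x4 v)
{
  return u64x4_permute (v, 3, 2, 1, 0);
}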

/* _extract_lo, _extract_hi */
/* *INDENT-OFF* */
#define _(t1,t2) \
always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); }\
\
always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }\

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _
/* *INDENT-ON* */
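
/* Usage sketch (illustrative only): the generated helpers split a 256-bit
   vector into its 128-bit halves and put them back, e.g. to fall back to
   SSE-width operations on each half. */
static_always_inline u32x8
u32x8_example_swap_halves (u32x8 v)
{
  u32x4 lo = u32x8_extract_lo (v);
  u32x4 hi = u32x8_extract_hi (v);
  return u32x8_insert_hi (u32x8_insert_lo (v, hi), lo);
}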

/* 256 bit packs. */
#define _(f, t, fn)                                                           \
  always_inline t t##_pack (f lo, f hi)                                       \
  {                                                                           \
    return (t) fn ((__m256i) lo, (__m256i) hi);                               \
  }

_ (i16x16, i8x32, _mm256_packs_epi16)
_ (i16x16, u8x32, _mm256_packus_epi16)
_ (i32x8, i16x16, _mm256_packs_epi32)
_ (i32x8, u16x16, _mm256_packus_epi32)

#undef _
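
/* Usage sketch (illustrative only): narrow sixteen 32-bit values into sixteen
   16-bit values with unsigned saturation. Note that the AVX2 pack
   instructions operate per 128-bit lane, so the result interleaves the low
   halves of lo/hi before their high halves. */
static_always_inline u16x16
u16x16_example_narrow (i32x8 lo, i32x8 hi)
{
  return u16x16_pack (lo, hi);
}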

static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

static_always_inline u32
i8x32_msb_mask (i8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}
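
/* Usage sketch (illustrative only): combine a byte-wise compare with
   _msb_mask to get one bit per byte lane, then count or scan the bits. */
static_always_inline u32
u8x32_example_count_eq (u8x32 v, u8 x)
{
  u32 bitmap = i8x32_msb_mask ((i8x32) (v == u8x32_splat (x)));
  return __builtin_popcount (bitmap);
}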

/* _from_ */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_(u16x8, u32x8, epu16_epi32)
_(u16x8, u64x4, epu16_epi64)
_(u32x4, u64x4, epu32_epi64)
_(u8x16, u16x16, epu8_epi16)
_(u8x16, u32x8, epu8_epi32)
_(u8x16, u64x4, epu8_epi64)
_(i16x8, i32x8, epi16_epi32)
_(i16x8, i64x4, epi16_epi64)
_(i32x4, i64x4, epi32_epi64)
_(i8x16, i16x16, epi8_epi16)
_(i8x16, i32x8, epi8_epi32)
_(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */
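
/* Usage sketch (illustrative only): the generated _from_ helpers zero- or
   sign-extend a 128-bit vector into a wider 256-bit one, e.g. accumulating
   u16 counters into u32 lanes without overflow. */
static_always_inline u32x8
u32x8_example_accumulate_u16 (u32x8 acc, u16x8 v)
{
  return acc + u32x8_from_u16x8 (v);
}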

static_always_inline u64x4
u64x4_byte_swap (u64x4 v)
{
  u8x32 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
  u8x32 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}
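
/* Usage sketch (illustrative only): the byte swaps are typically used to
   convert between network and host byte order, several fields at a time. */
static_always_inline u32x8
u32x8_example_ntohl (u32x8 net)
{
  return u32x8_byte_swap (net);
}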

#define u8x32_align_right(a, b, imm) \
  (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)

#define u64x4_align_right(a, b, imm)                                          \
  (u64x4) _mm256_alignr_epi64 ((__m256i) a, (__m256i) b, imm)

static_always_inline u32
u32x8_sum_elts (u32x8 sum8)
{
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
  return sum8[0] + sum8[4];
}
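
/* Usage sketch (illustrative only): u32x8_sum_elts folds each 128-bit half
   with 8- and 4-byte rotations, then adds one element from each half, giving
   the sum of all eight lanes. */
static_always_inline u32
u32x8_example_sum_array (u32 *p)
{
  return u32x8_sum_elts (u32x8_load_unaligned (p));
}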

static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

static_always_inline u32
u32x8_hxor (u32x8 v)
{
  u32x4 v4;
  v4 = u32x8_extract_lo (v) ^ u32x8_extract_hi (v);
  v4 ^= (u32x4) u8x16_align_right (v4, v4, 8);
  v4 ^= (u32x4) u8x16_align_right (v4, v4, 4);
  return v4[0];
}

static_always_inline u8x32
u8x32_xor3 (u8x32 a, u8x32 b, u8x32 c)
{
#if __AVX512F__
  return (u8x32) _mm256_ternarylogic_epi32 ((__m256i) a, (__m256i) b,
                                            (__m256i) c, 0x96);
#endif
  return a ^ b ^ c;
}
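
/* Note (illustrative): in u8x32_xor3 above, truth table 0x96 makes
   _mm256_ternarylogic_epi32 compute a ^ b ^ c in a single instruction when
   AVX-512 is available; otherwise the plain two-XOR path is used. */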

static_always_inline u8x32
u8x32_reflect_u8x16 (u8x32 x)
{
  static const u8x32 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x32) _mm256_shuffle_epi8 ((__m256i) x, (__m256i) mask);
}

static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}
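
/* Usage sketch (illustrative only): u16x16_mask_last zeroes the trailing
   n_last lanes (n_last <= 16), e.g. to drop slots beyond the number of valid
   descriptors in a partially filled batch. */
static_always_inline u16x16
u16x16_example_trim (u16 *p, u8 n_invalid)
{
  return u16x16_mask_last (u16x16_load_unaligned (p), n_invalid);
}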

static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}

#define u32x8_blend(a,b,m) \
  (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)

static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
              void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}

static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
               void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

#define u32x8_gather_u32(base, indices, scale)                                \
  (u32x8) _mm256_i32gather_epi32 ((const int *) base, (__m256i) indices, scale)

#ifdef __AVX512F__
#define u32x8_scatter_u32(base, indices, v, scale)                            \
  _mm256_i32scatter_epi32 (base, (__m256i) indices, (__m256i) v, scale)
#else
#define u32x8_scatter_u32(base, indices, v, scale)                            \
  for (u32 i = 0; i < 8; i++)                                                 \
    *((u32u *) ((u8 *) base + (scale) * (indices)[i])) = (v)[i];
#endif
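
/* Usage sketch (illustrative only): gather eight u32 values from an array by
   element index. With scale = 4 the index vector holds element indices into a
   u32 array; scale must be a compile-time constant of 1, 2, 4 or 8. */
static_always_inline u32x8
u32x8_example_gather_by_index (u32 *tbl, u32x8 idx)
{
  return u32x8_gather_u32 (tbl, idx, 4);
}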

static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{
  return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
                                     (__m256i) mask);
}

#define u8x32_word_shift_left(a, n) \
  (u8x32) _mm256_bslli_epi128 ((__m256i) a, n)
#define u8x32_word_shift_right(a, n) \
  (u8x32) _mm256_bsrli_epi128 ((__m256i) a, n)

#define u32x8_permute_lanes(a, b, m) \
  (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
  (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)

static_always_inline u32x8
u32x8_min (u32x8 a, u32x8 b)
{
  return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
}

static_always_inline u32
u32x8_min_scalar (u32x8 v)
{
  return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
                                      u32x8_extract_hi (v)));
}
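
/* Usage sketch (illustrative only): reduce sixteen u32 values to their
   minimum by combining the per-lane min with the scalar reduction. */
static_always_inline u32
u32x8_example_min16 (u32x8 a, u32x8 b)
{
  return u32x8_min_scalar (u32x8_min (a, b));
}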

static_always_inline void
u32x8_transpose (u32x8 a[8])
{
  u64x4 r[8], x, y;

  r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
  r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
  r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
  r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
  r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
  r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
  r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
  r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);

  x = u64x4_interleave_lo (r[0], r[2]);
  y = u64x4_interleave_lo (r[4], r[6]);
  a[0] = u32x8_permute_lanes (x, y, 0x20);
  a[4] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[0], r[2]);
  y = u64x4_interleave_hi (r[4], r[6]);
  a[1] = u32x8_permute_lanes (x, y, 0x20);
  a[5] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_lo (r[1], r[3]);
  y = u64x4_interleave_lo (r[5], r[7]);
  a[2] = u32x8_permute_lanes (x, y, 0x20);
  a[6] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[1], r[3]);
  y = u64x4_interleave_hi (r[5], r[7]);
  a[3] = u32x8_permute_lanes (x, y, 0x20);
  a[7] = u32x8_permute_lanes (x, y, 0x31);
}
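
/* Note (illustrative): u32x8_transpose treats a[0..7] as an 8x8 matrix of u32
   and transposes it in place, so element j of the original a[i] ends up as
   element i of a[j]. u64x4_transpose below does the same for a 4x4 u64
   matrix; only a[0..3] of its argument are used. */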

static_always_inline void
u64x4_transpose (u64x4 a[8])
{
  u64x4 r[4];

  r[0] = u64x4_interleave_lo (a[0], a[1]);
  r[1] = u64x4_interleave_hi (a[0], a[1]);
  r[2] = u64x4_interleave_lo (a[2], a[3]);
  r[3] = u64x4_interleave_hi (a[2], a[3]);

  a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
  a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
  a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
  a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
}

static_always_inline u8x32
u8x32_splat_u8x16 (u8x16 a)
{
  return (u8x32) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u32x8
u32x8_splat_u32x4 (u32x4 a)
{
  return (u32x8) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u64x4
u64x4_splat_u64x2 (u64x2 a)
{
  return (u64x4) _mm256_broadcastsi128_si256 ((__m128i) a);
}

static_always_inline u8x32
u8x32_load_partial (u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  return u8x32_mask_load_zero (data, pow2_mask (n));
#else
  u8x32 r = {};
  if (n > 16)
    {
      r = u8x32_insert_lo (r, *(u8x16u *) data);
      r = u8x32_insert_hi (r, u8x16_load_partial (data + 16, n - 16));
    }
  else
    r = u8x32_insert_lo (r, u8x16_load_partial (data, n));
  return r;
#endif
}

static_always_inline void
u8x32_store_partial (u8x32 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  u8x32_mask_store (r, data, pow2_mask (n));
#else
  if (n > 16)
    {
      *(u8x16u *) data = u8x32_extract_lo (r);
      u8x16_store_partial (u8x32_extract_hi (r), data + 16, n - 16);
    }
  else
    u8x16_store_partial (u8x32_extract_lo (r), data, n);
#endif
}
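
/* Usage sketch (illustrative only): copy up to 32 bytes (n <= 32) without
   reading or writing past the n-th byte, using masked loads/stores when
   available. */
static_always_inline void
u8x32_example_copy_partial (u8 *dst, u8 *src, uword n)
{
  u8x32_store_partial (u8x32_load_partial (src, n), dst, n);
}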

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */