/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

#define _mm256_set1_epi64 _mm256_set1_epi64x

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, interleave_lo, interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \


foreach_avx2_vec256i foreach_avx2_vec256u
#undef _
/* *INDENT-ON* */

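/* permute the 32-bit elements of v: element n of the result is
   v[idx[n] & 7] (vpermd) */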
always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

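/* reorder the four 64-bit elements of v; m0..m3 must be compile-time
   constants in the range 0..3 (vpermq) */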
#define u64x4_permute(v, m0, m1, m2, m3)                                      \
  (u64x4) _mm256_permute4x64_epi64 (                                          \
    (__m256i) v, ((m0) | (m1) << 2 | (m2) << 4 | (m3) << 6))

/* _extract_lo, _extract_hi, _insert_lo, _insert_hi */
/* *INDENT-OFF* */
#define _(t1,t2) \
always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); } \
\
always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); } \

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _
/* *INDENT-ON* */

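/* narrow 16-bit elements to 8-bit with saturation (unsigned for
   u16x16_pack, signed for i16x16_pack); vpackuswb/vpacksswb operate per
   128-bit lane, so result bytes come from lo[0..7], hi[0..7], lo[8..15],
   hi[8..15] */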
always_inline u8x32
u16x16_pack (u16x16 lo, u16x16 hi)
{
  return (u8x32) _mm256_packus_epi16 ((__m256i) lo, (__m256i) hi);
}

always_inline i8x32
i16x16_pack (i16x16 lo, i16x16 hi)
{
  return (i8x32) _mm256_packs_epi16 ((__m256i) lo, (__m256i) hi);
}

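/* collect the most significant bit of each byte into a 32-bit mask
   (vpmovmskb); bit n of the result comes from element n */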
static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

static_always_inline u32
i8x32_msb_mask (i8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

/* widening conversions, <dst>_from_<src>: zero-extend for unsigned
   sources, sign-extend for signed sources */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_(u16x8, u32x8, epu16_epi32)
_(u16x8, u64x4, epu16_epi64)
_(u32x4, u64x4, epu32_epi64)
_(u8x16, u16x16, epu8_epi16)
_(u8x16, u32x8, epu8_epi32)
_(u8x16, u64x4, epu8_epi64)
_(i16x8, i32x8, epi16_epi32)
_(i16x8, i64x4, epi16_epi64)
_(i32x4, i64x4, epi32_epi64)
_(i8x16, i16x16, epi8_epi16)
_(i8x16, i32x8, epi8_epi32)
_(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */

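/* reverse the byte order within each element (vpshufb with a constant
   per-lane shuffle mask) */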
static_always_inline u64x4
u64x4_byte_swap (u64x4 v)
{
  u8x32 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
  u8x32 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

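/* arbitrary byte shuffle (vpshufb): each selector byte indexes within its
   own 128-bit lane, and selectors with the MSB set produce zero */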
static_always_inline u8x32
u8x32_shuffle (u8x32 v, u8x32 m)
{
  return (u8x32) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) m);
}

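/* per-128-bit-lane byte-wise right shift of the concatenation a:b
   (a high, b low) by imm bytes (vpalignr) */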
#define u8x32_align_right(a, b, imm) \
  (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)

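/* horizontal sum of all 8 elements: the two align_right steps add the
   elements 2 and then 1 positions away within each 128-bit lane, and the
   final scalar add combines the two lane sums */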
static_always_inline u32
u32x8_sum_elts (u32x8 sum8)
{
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
  return sum8[0] + sum8[4];
}

static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

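/* zero the last n_last elements of v; n_last must be in the range 0..16 */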
static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}

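/* integer <-> float conversions; vcvtdq2ps interprets its input as signed
   32-bit integers and vcvttps2dq truncates toward zero */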
static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}

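/* blend with a compile-time immediate mask: bit n set selects element n
   from the second operand, otherwise from the first; for u16x16_blend the
   8-bit mask is applied to each 128-bit lane */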
#define u32x8_blend(a,b,m) \
  (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)

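/* gathers and scatters implemented with scalar loads and stores rather
   than the AVX2 gather instructions */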
static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
              void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}

static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
               void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

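/* per-byte greater-than compare; note that vpcmpgtb compares the bytes as
   signed values even though the operands are u8x32 */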
static_always_inline u8x32
u8x32_is_greater (u8x32 v1, u8x32 v2)
{
  return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
}

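/* per-byte variable blend (vpblendvb): where the MSB of the mask byte is
   set take the byte from v2, otherwise from v1 */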
static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{
  return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
                                     (__m256i) mask);
}

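/* combine 128-bit lanes of a and b as selected by the immediate m
   (vperm2i128): 0x20 gives the low lanes of a and b, 0x31 the high lanes */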
#define u32x8_permute_lanes(a, b, m) \
  (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
  (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)

static_always_inline u32x8
u32x8_min (u32x8 a, u32x8 b)
{
  return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
}

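/* unsigned minimum across all 8 elements, computed via the 128-bit
   u32x4_min / u32x4_min_scalar helpers */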
static_always_inline u32
u32x8_min_scalar (u32x8 v)
{
  return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
                                      u32x8_extract_hi (v)));
}

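/* in-place transpose of an 8x8 matrix of u32 stored in a[0..7]: interleave
   within 128-bit lanes, then recombine lanes with vperm2i128 */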
static_always_inline void
u32x8_transpose (u32x8 a[8])
{
  u64x4 r[8], x, y;

  r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
  r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
  r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
  r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
  r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
  r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
  r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
  r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);

  x = u64x4_interleave_lo (r[0], r[2]);
  y = u64x4_interleave_lo (r[4], r[6]);
  a[0] = u32x8_permute_lanes (x, y, 0x20);
  a[4] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[0], r[2]);
  y = u64x4_interleave_hi (r[4], r[6]);
  a[1] = u32x8_permute_lanes (x, y, 0x20);
  a[5] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_lo (r[1], r[3]);
  y = u64x4_interleave_lo (r[5], r[7]);
  a[2] = u32x8_permute_lanes (x, y, 0x20);
  a[6] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[1], r[3]);
  y = u64x4_interleave_hi (r[5], r[7]);
  a[3] = u32x8_permute_lanes (x, y, 0x20);
  a[7] = u32x8_permute_lanes (x, y, 0x31);
}

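/* in-place transpose of a 4x4 matrix of u64; only a[0..3] are used */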
static_always_inline void
u64x4_transpose (u64x4 a[8])
{
  u64x4 r[4];

  r[0] = u64x4_interleave_lo (a[0], a[1]);
  r[1] = u64x4_interleave_hi (a[0], a[1]);
  r[2] = u64x4_interleave_lo (a[2], a[3]);
  r[3] = u64x4_interleave_hi (a[2], a[3]);

  a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
  a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
  a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
  a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
}

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */