/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

#define _mm256_set1_epi64 _mm256_set1_epi64x

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, interleave_lo, interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \


foreach_avx2_vec256i foreach_avx2_vec256u
#undef _
/* *INDENT-ON* */

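/* Usage sketch (illustrative): the expansion above generates one helper set
   per element type, e.g. for u32x8 -- "buf" is a hypothetical 32-byte buffer:

     u32x8 v = u32x8_splat (7);             // broadcast 7 to all 8 lanes
     u32x8_store_unaligned (v, buf);
     u32x8 w = u32x8_load_unaligned (buf);
     ASSERT (u32x8_is_equal (v, w));
     ASSERT (u32x8_is_all_equal (w, 7));
*/
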
always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

#define u64x4_permute(v, m0, m1, m2, m3) \
  (u64x4) _mm256_permute4x64_epi64 ( \
    (__m256i) v, ((m0) | (m1) << 2 | (m2) << 4 | (m3) << 6))

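/* Illustrative examples (operand values are hypothetical): u32x8_permute
   selects lanes by an index vector, u64x4_permute by four immediate indices:

     u32x8 idx = { 7, 6, 5, 4, 3, 2, 1, 0 };
     u32x8 rev = u32x8_permute (v, idx);           // reverse the 8 lanes of v
     u64x4 swap = u64x4_permute (x, 2, 3, 0, 1);   // swap the 128-bit halves of x
*/
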
/* _extract_lo, _extract_hi, _insert_lo, _insert_hi */
/* *INDENT-OFF* */
#define _(t1,t2) \
always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); } \
\
always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); } \

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _
/* *INDENT-ON* */

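/* Illustrative example (local names are hypothetical): a 256-bit vector can
   be split into, or rebuilt from, its 128-bit halves:

     u32x4 lo = u32x8_extract_lo (v);       // lanes 0..3
     u32x4 hi = u32x8_extract_hi (v);       // lanes 4..7
     u32x8 w = u32x8_insert_hi (v, lo);     // v with its upper half replaced by lo
*/
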
/* 256 bit packs. */
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m256i) lo, (__m256i) hi); \
  }

_ (i16x16, i8x32, _mm256_packs_epi16)
_ (i16x16, u8x32, _mm256_packus_epi16)
_ (i32x8, i16x16, _mm256_packs_epi32)
_ (i32x8, u16x16, _mm256_packus_epi32)

#undef _

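/* Illustrative example: the pack helpers narrow two 256-bit vectors with
   saturation.  Like the underlying instructions they operate per 128-bit
   lane, so the result order is lo[0..7], hi[0..7], lo[8..15], hi[8..15]
   (inputs a and b are hypothetical i16x16 values):

     i8x32 s = i8x32_pack (a, b);     // signed saturation
     u8x32 u = u8x32_pack (a, b);     // unsigned saturation
*/
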
static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

static_always_inline u32
i8x32_msb_mask (i8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

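/* Illustrative example: a common pattern is to compare bytes and then collect
   the per-byte most-significant bits into a scalar mask ("needle" and
   "haystack" are hypothetical u8x32 values):

     u8x32 match = (u8x32) (needle == haystack);   // per-byte 0x00 / 0xff
     u32 mask = u8x32_msb_mask (match);            // bit i set if byte i matched
     if (mask)
       first = __builtin_ctz (mask);               // index of first matching byte
*/
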
/* _from_ */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_(u16x8, u32x8, epu16_epi32)
_(u16x8, u64x4, epu16_epi64)
_(u32x4, u64x4, epu32_epi64)
_(u8x16, u16x16, epu8_epi16)
_(u8x16, u32x8, epu8_epi32)
_(u8x16, u64x4, epu8_epi64)
_(i16x8, i32x8, epi16_epi32)
_(i16x8, i64x4, epi16_epi64)
_(i32x4, i64x4, epi32_epi64)
_(i8x16, i16x16, epi8_epi16)
_(i8x16, i32x8, epi8_epi32)
_(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */

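/* Illustrative example: the conversions above widen a 128-bit vector into a
   256-bit one, zero-extending the u-types and sign-extending the i-types
   ("bytes" and "words" are hypothetical locals):

     u16x16 w = u16x16_from_u8x16 (bytes);   // 16 x u8 -> 16 x u16, zero-extended
     i32x8 d = i32x8_from_i16x8 (words);     // 8 x i16 -> 8 x i32, sign-extended
*/
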
static_always_inline u64x4
u64x4_byte_swap (u64x4 v)
{
  u8x32 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
  u8x32 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

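/* Illustrative example: the byte-swap helpers reverse byte order within each
   element, e.g. to convert packed fields between host and network byte order
   ("ports" is a hypothetical u16x16 in host order):

     u16x16 ports_be = u16x16_byte_swap (ports);   // 16 u16 values to big-endian
*/
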
static_always_inline u8x32
u8x32_shuffle (u8x32 v, u8x32 m)
{
  return (u8x32) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) m);
}

#define u8x32_align_right(a, b, imm) \
  (u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)

#define u64x4_align_right(a, b, imm) \
  (u64x4) _mm256_alignr_epi64 ((__m256i) a, (__m256i) b, imm)

static_always_inline u32
u32x8_sum_elts (u32x8 sum8)
{
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
  sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
  return sum8[0] + sum8[4];
}

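/* Note: _mm256_alignr_epi8 shifts within each 128-bit lane, so the two folds
   in u32x8_sum_elts leave each lane's total in that lane's element 0, and
   sum8[0] + sum8[4] adds the two lane totals.  A usage sketch ("data" and "n"
   are hypothetical, n assumed to be a multiple of 8):

     u32x8 acc = {};
     for (int i = 0; i < n; i += 8)
       acc += u32x8_load_unaligned (data + i);
     u32 total = u32x8_sum_elts (acc);
*/
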
static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}

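/* Illustrative example: u16x16_mask_last zeroes the trailing n_last elements
   and keeps the first 16 - n_last, e.g. when only "n_valid" leading u16
   values of a load are meaningful (names are hypothetical):

     u16x16 v = u16x16_load_unaligned (p);
     v = u16x16_mask_last (v, 16 - n_valid);   // keep the n_valid leading elements
*/
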
static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}

#define u32x8_blend(a,b,m) \
  (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)

static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
              void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}

static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
               void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

static_always_inline u8x32
u8x32_is_greater (u8x32 v1, u8x32 v2)
{
  return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
}

static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{
  return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
                                     (__m256i) mask);
}

#define u32x8_permute_lanes(a, b, m) \
  (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
  (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)

static_always_inline u32x8
u32x8_min (u32x8 a, u32x8 b)
{
  return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
}

static_always_inline u32
u32x8_min_scalar (u32x8 v)
{
  return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
                                      u32x8_extract_hi (v)));
}

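/* 8x8 (u32) and 4x4 (u64) in-register transposes: rows are first interleaved
   pairwise within 128-bit lanes, then permute_lanes reassembles them -- 0x20
   combines the low 128-bit lanes of both operands, 0x31 the high lanes. */
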
static_always_inline void
u32x8_transpose (u32x8 a[8])
{
  u64x4 r[8], x, y;

  r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
  r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
  r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
  r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
  r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
  r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
  r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
  r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);

  x = u64x4_interleave_lo (r[0], r[2]);
  y = u64x4_interleave_lo (r[4], r[6]);
  a[0] = u32x8_permute_lanes (x, y, 0x20);
  a[4] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[0], r[2]);
  y = u64x4_interleave_hi (r[4], r[6]);
  a[1] = u32x8_permute_lanes (x, y, 0x20);
  a[5] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_lo (r[1], r[3]);
  y = u64x4_interleave_lo (r[5], r[7]);
  a[2] = u32x8_permute_lanes (x, y, 0x20);
  a[6] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[1], r[3]);
  y = u64x4_interleave_hi (r[5], r[7]);
  a[3] = u32x8_permute_lanes (x, y, 0x20);
  a[7] = u32x8_permute_lanes (x, y, 0x31);
}

static_always_inline void
u64x4_transpose (u64x4 a[8])
{
  u64x4 r[4];

  r[0] = u64x4_interleave_lo (a[0], a[1]);
  r[1] = u64x4_interleave_hi (a[0], a[1]);
  r[2] = u64x4_interleave_lo (a[2], a[3]);
  r[3] = u64x4_interleave_hi (a[2], a[3]);

  a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
  a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
  a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
  a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
}

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */