/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, is_zero_mask, interleave_lo, interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); }

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
/* *INDENT-ON* */
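/* Illustrative sketch of the generated per-type operations (names follow
   the _(t,s,c,i) expansion above; pointers and values here are made up):

     u32x16 v = u32x16_splat (7);             // all 16 lanes = 7
     u32x16 w = u32x16_load_unaligned (p);    // 64-byte unaligned load
     if (u32x16_is_all_equal (w, 7))          // compare against a scalar
       u32x16_store_unaligned (v, p);
     u16 nz = u32x16_is_zero_mask (w);        // bit i set if lane i != 0
*/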

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}
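/* Illustrative sketch: after a per-lane compare each 16-bit lane is all
   ones or all zeros, so the MSB mask yields one bit per matching lane
   (operands here are made up):

     u32 hits = u16x32_msb_mask ((u16x32) (a == b));  // bit i: a[i] == b[i]
*/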

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
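/* Illustrative sketch: byte swap is the vectorized host/network order
   conversion, e.g. 16 big-endian u32 values at once (pointer name is
   made up):

     u32x16 net = u32x16_load_unaligned (addrs);
     u32x16 host = u32x16_byte_swap (net);
*/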

static_always_inline u32x8
u32x16_extract_lo (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u32x8
u32x16_extract_hi (u32x16 v)
{
  return (u32x8) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u8x32
u8x64_extract_lo (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 0);
}

static_always_inline u8x32
u8x64_extract_hi (u8x64 v)
{
  return (u8x32) _mm512_extracti64x4_epi64 ((__m512i) v, 1);
}

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}
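/* Horizontal reduction pattern: fold the 512-bit vector into one 256-bit
   half with a pairwise min, then let the AVX2 helper finish. Illustrative
   use (array name is made up):

     u32x16 lens = u32x16_load_unaligned (pkt_lens);
     u32 shortest = u32x16_min_scalar (lens);
*/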

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}
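/* Each mask lane selects one of 16 source lanes: indices 0-7 pick from a,
   8-15 pick from b. Illustrative sketch interleaving the low halves:

     u64x8 m = { 0, 8, 1, 9, 2, 10, 3, 11 };
     u64x8 r = u64x8_permute (a, b, m);   // a[0], b[0], a[1], b[1], ...
*/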

#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)
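/* The 8-bit immediate d is a truth table indexed by the bit triplet taken
   from a, b and c: bit (a << 2 | b << 1 | c) of d gives the result bit.
   For example 0x96 implements a ^ b ^ c and 0xca implements a ? b : c.
   Illustrative sketch:

     u32x16 x = u32x16_ternary_logic (a, b, c, 0x96);   // 3-way xor
*/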

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128 ((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128 ((__m512i) a, n)
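/* Note: the byte shifts above operate within each 128-bit lane (four
   independent 16-byte shifts), not across the whole 512-bit register, and
   n must be a compile-time constant. Illustrative sketch:

     u8x64 s = u8x64_word_shift_left (v, 2);   // shift each lane by 2 bytes
*/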

static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}
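/* Reverses the byte order inside each 128-bit lane, a common step when
   feeding big-endian data to carry-less-multiply code (e.g. GHASH-style
   hashing). Illustrative sketch:

     u8x64 r = u8x64_reflect_u8x16 (v);   // per-lane byte reversal
*/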

static_always_inline u8x64
u8x64_mask_load (u8x64 a, void *p, u64 mask)
{
  return (u8x64) _mm512_mask_loadu_epi8 ((__m512i) a, mask, p);
}

static_always_inline void
u8x64_mask_store (u8x64 a, void *p, u64 mask)
{
  _mm512_mask_storeu_epi8 (p, mask, (__m512i) a);
}
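/* Masked loads/stores touch only the bytes whose mask bit is set, which
   makes short-tail copies safe without over-reading. A minimal sketch,
   assuming n < 64 (pow2_mask comes from vppinfra/clib.h; src/dst are
   made up):

     u64 m = pow2_mask (n);                           // lowest n bits set
     u8x64 v = u8x64_mask_load (u8x64_splat (0), src, m);
     u8x64_mask_store (v, dst, m);                    // writes exactly n bytes
*/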

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}
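/* Both splats broadcast the same 128 bits to all four lanes, so a single
   _mm512_broadcast_i64x2 serves any element width; only the C-level vector
   type differs. */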

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
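/* Lane i of the result comes from b when mask bit i is set, else from a.
   Illustrative sketch:

     u32x16 r = u32x16_mask_blend (a, b, 0x00ff);   // low 8 lanes from b
*/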
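/* In-place transpose of a 16x16 matrix of u32 held in 16 vector registers:
   32-bit unpacks build 2x2 blocks, then 64-bit unpacks and two-source
   permutes move the blocks into their final rows. */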
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}
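/* Illustrative sketch (array name is made up):

     u32x16 rows[16];          // rows[i] holds row i of a 16x16 u32 matrix
     u32x16_transpose (rows);  // rows[i] now holds column i
*/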
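/* In-place transpose of an 8x8 matrix of u64; same permute scheme as
   above, with one fewer unpack level since lanes are already 64 bits
   wide. */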
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */