/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

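/* the AVX2 intrinsic for a 64-bit broadcast is named _mm256_set1_epi64x;
   alias it so the generic macro below can use the epi64 suffix uniformly */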
#define _mm256_set1_epi64 _mm256_set1_epi64x

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal, interleave_lo, interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); }

foreach_avx2_vec256i foreach_avx2_vec256u
#undef _
/* *INDENT-ON* */

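/* For illustration, the (u,32,8,epi32) expansion of the macro above
   provides, among others:
     u32x8 u32x8_splat (u32 x);
     u32x8 u32x8_load_unaligned (void *p);
     void  u32x8_store_unaligned (u32x8 v, void *p);
     int   u32x8_is_all_zero (u32x8 x);
     int   u32x8_is_equal (u32x8 a, u32x8 b);
     int   u32x8_is_all_equal (u32x8 v, u32 x);
     u32x8 u32x8_interleave_lo (u32x8 a, u32x8 b);
     u32x8 u32x8_interleave_hi (u32x8 a, u32x8 b); */

/* permute the 32-bit elements of v according to the indices in idx (vpermd) */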
always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

/* _extract_lo, _extract_hi, _insert_lo, _insert_hi */
/* *INDENT-OFF* */
#define _(t1,t2) \
always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); } \
\
always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _
/* *INDENT-ON* */
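
/* For illustration:
     u32x4 lo = u32x8_extract_lo (v);    returns the low 128 bits of v
     u32x8 w  = u32x8_insert_hi (v, x);  returns v with its upper 128 bits
                                         replaced by x */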
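
/* return a 32-bit mask with bit i set to the most significant bit of
   byte i (vpmovmskb) */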
static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

/* _extend_to_ */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
f##_extend_to_##t (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_(u16x8, u32x8, epu16_epi32)
_(u16x8, u64x4, epu16_epi64)
_(u32x4, u64x4, epu32_epi64)
_(u8x16, u16x16, epu8_epi16)
_(u8x16, u32x8, epu8_epi32)
_(u8x16, u64x4, epu8_epi64)
_(i16x8, i32x8, epi16_epi32)
_(i16x8, i64x4, epi16_epi64)
_(i32x4, i64x4, epi32_epi64)
_(i8x16, i16x16, epi8_epi16)
_(i8x16, i32x8, epi8_epi32)
_(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */
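
/* Each _extend_to_ widens the low elements of a 128-bit vector into a
   256-bit vector, zero-extending the u variants and sign-extending the i
   variants; e.g. u16x8_extend_to_u32x8 widens eight u16s to eight u32s,
   while u16x8_extend_to_u64x4 widens only the low four. */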

static_always_inline u32x8
u32x8_byte_swap (u32x8 v)
{
  u8x32 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}

static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

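/* zero the last n_last elements of v; n_last must be in the range 0..16 */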
static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}

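/* f32x8_from_u32x8 uses a signed integer conversion (vcvtdq2ps);
   u32x8_from_f32x8 truncates toward zero (vcvttps2dq) */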
static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}

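/* the blend selector m / mask is an 8-bit immediate and must be a
   compile-time constant (vpblendd / vpblendw) */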
#define u32x8_blend(a,b,m) \
  (u32x8) _mm256_blend_epi32 ((__m256i) a, (__m256i) b, m)

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)

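/* the gather/scatter helpers below use scalar loads and stores rather than
   AVX2 gather instructions (scatter has no AVX2 equivalent) */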
static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
	      void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}

static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
	       void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

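/* note: vpcmpgtb performs a signed byte comparison, so elements with the
   top bit set compare as negative */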
static_always_inline u8x32
u8x32_is_greater (u8x32 v1, u8x32 v2)
{
  return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
}

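/* per-byte select: where the mask byte has its top bit set the result comes
   from v2, otherwise from v1 (vpblendvb) */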
static_always_inline u8x32
u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
{
  return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
				     (__m256i) mask);
}

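/* select 128-bit lanes from a and b according to the immediate m
   (vperm2i128): m = 0x20 takes the low lane of a and the low lane of b,
   m = 0x31 takes the high lane of a and the high lane of b */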
#define u32x8_permute_lanes(a, b, m) \
  (u32x8) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)
#define u64x4_permute_lanes(a, b, m) \
  (u64x4) _mm256_permute2x128_si256 ((__m256i) a, (__m256i) b, m)

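/* element-wise unsigned 32-bit minimum and its horizontal (scalar)
   reduction over all 8 elements */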
static_always_inline u32x8
u32x8_min (u32x8 a, u32x8 b)
{
  return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
}

static_always_inline u32
u32x8_min_scalar (u32x8 v)
{
  return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
				      u32x8_extract_hi (v)));
}

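/* in-place transpose of an 8x8 matrix of u32 stored in a[0] .. a[7] */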
static_always_inline void
u32x8_transpose (u32x8 a[8])
{
  u64x4 r[8], x, y;

  r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
  r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
  r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
  r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
  r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
  r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
  r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
  r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);

  x = u64x4_interleave_lo (r[0], r[2]);
  y = u64x4_interleave_lo (r[4], r[6]);
  a[0] = u32x8_permute_lanes (x, y, 0x20);
  a[4] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[0], r[2]);
  y = u64x4_interleave_hi (r[4], r[6]);
  a[1] = u32x8_permute_lanes (x, y, 0x20);
  a[5] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_lo (r[1], r[3]);
  y = u64x4_interleave_lo (r[5], r[7]);
  a[2] = u32x8_permute_lanes (x, y, 0x20);
  a[6] = u32x8_permute_lanes (x, y, 0x31);

  x = u64x4_interleave_hi (r[1], r[3]);
  y = u64x4_interleave_hi (r[5], r[7]);
  a[3] = u32x8_permute_lanes (x, y, 0x20);
  a[7] = u32x8_permute_lanes (x, y, 0x31);
}

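/* in-place transpose of a 4x4 matrix of u64 stored in a[0] .. a[3]
   (only the first four vectors of the argument array are used) */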
static_always_inline void
u64x4_transpose (u64x4 a[8])
{
  u64x4 r[4];

  r[0] = u64x4_interleave_lo (a[0], a[1]);
  r[1] = u64x4_interleave_hi (a[0], a[1]);
  r[2] = u64x4_interleave_lo (a[2], a[3]);
  r[3] = u64x4_interleave_hi (a[2], a[3]);

  a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
  a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
  a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
  a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
}

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */