/*
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx2_h
#define included_vector_avx2_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx2_vec256i \
  _(i,8,32,epi8) _(i,16,16,epi16) _(i,32,8,epi32) _(i,64,4,epi64)
#define foreach_avx2_vec256u \
  _(u,8,32,epi8) _(u,16,16,epi16) _(u,32,8,epi32) _(u,64,4,epi64)
#define foreach_avx2_vec256f \
  _(f,32,8,ps) _(f,64,4,pd)

#define _mm256_set1_epi64 _mm256_set1_epi64x

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm256_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); }

foreach_avx2_vec256i foreach_avx2_vec256u
#undef _
/* *INDENT-ON* */
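/* The instantiation above generates splat, load_unaligned, store_unaligned,
   is_all_zero, is_equal, is_all_equal, interleave_lo and interleave_hi for
   every 256-bit integer vector type (u8x32 ... i64x4).  Illustrative usage,
   not part of the original header:

     u32x8 v = u32x8_splat (7);
     if (u32x8_is_all_equal (v, 7))
       u32x8_store_unaligned (v, p);

   Note that interleave_lo/interleave_hi map to the AVX2 unpack intrinsics,
   which interleave within each 128-bit lane rather than across the whole
   256-bit register. */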

always_inline u32x8
u32x8_permute (u32x8 v, u32x8 idx)
{
  return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
}

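/* Note: u32x8_permute uses _mm256_permutevar8x32_epi32, which can move any
   source lane to any destination lane (it crosses the 128-bit lane
   boundary).  For example, idx = { 7, 6, 5, 4, 3, 2, 1, 0 } reverses the
   order of the eight 32-bit elements. */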
/* _extract_lo, _extract_hi, _insert_lo, _insert_hi */
/* *INDENT-OFF* */
#define _(t1,t2) \
always_inline t1 \
t2##_extract_lo (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
\
always_inline t1 \
t2##_extract_hi (t2 v) \
{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
\
always_inline t2 \
t2##_insert_lo (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); } \
\
always_inline t2 \
t2##_insert_hi (t2 v1, t1 v2) \
{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }

_(u8x16, u8x32)
_(u16x8, u16x16)
_(u32x4, u32x8)
_(u64x2, u64x4)
#undef _
/* *INDENT-ON* */
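/* Illustrative use of the halves helpers above (not part of the original
   header): splitting a 256-bit vector and putting it back together.

     u32x4 lo = u32x8_extract_lo (v);
     u32x4 hi = u32x8_extract_hi (v);
     u32x8 w  = u32x8_insert_hi (u32x8_insert_lo (v, lo), hi);   // w == v
*/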

static_always_inline u32
u8x32_msb_mask (u8x32 v)
{
  return _mm256_movemask_epi8 ((__m256i) v);
}

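/* u8x32_msb_mask gathers the most significant bit of each of the 32 bytes
   into a 32-bit scalar (bit n of the result is the MSB of byte n), e.g. for
   turning a byte-wise compare result into a bitmask. */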
/* _extend_to_ */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
f##_extend_to_##t (f x) \
{ return (t) _mm256_cvt##i ((__m128i) x); }

_(u16x8, u32x8, epu16_epi32)
_(u16x8, u64x4, epu16_epi64)
_(u32x4, u64x4, epu32_epi64)
_(u8x16, u16x16, epu8_epi16)
_(u8x16, u32x8, epu8_epi32)
_(u8x16, u64x4, epu8_epi64)
_(i16x8, i32x8, epi16_epi32)
_(i16x8, i64x4, epi16_epi64)
_(i32x4, i64x4, epi32_epi64)
_(i8x16, i16x16, epi8_epi16)
_(i8x16, i32x8, epi8_epi32)
_(i8x16, i64x4, epi8_epi64)
#undef _
/* *INDENT-ON* */
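/* The epu variants zero-extend and the epi variants sign-extend.  Each
   helper widens as many elements from the low end of the 128-bit source as
   fit in the 256-bit destination, e.g. u8x16_extend_to_u32x8 takes the first
   8 bytes of its argument and returns them as eight u32 values. */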

static_always_inline u16x16
u16x16_byte_swap (u16x16 v)
{
  u8x32 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
}
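/* u16x16_byte_swap swaps the two bytes within every 16-bit lane, e.g. for
   converting sixteen u16 values between network and host byte order.
   Because _mm256_shuffle_epi8 shuffles within each 128-bit lane, the swap
   pattern is simply repeated for both halves of the register. */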

static_always_inline u32x8
u32x8_hadd (u32x8 v1, u32x8 v2)
{
  return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
}

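/* u32x8_hadd adds adjacent pairs of 32-bit elements.  Mind the AVX2 lane
   layout: the result interleaves pair sums from v1 and v2 per 128-bit lane,
   i.e. { v1[0]+v1[1], v1[2]+v1[3], v2[0]+v2[1], v2[2]+v2[3],
          v1[4]+v1[5], v1[6]+v1[7], v2[4]+v2[5], v2[6]+v2[7] }. */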
static_always_inline u16x16
u16x16_mask_last (u16x16 v, u8 n_last)
{
  const u16x16 masks[17] = {
    {0},
    {-1},
    {-1, -1},
    {-1, -1, -1},
    {-1, -1, -1, -1},
    {-1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
  };

  ASSERT (n_last < 17);

  return v & masks[16 - n_last];
}
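/* u16x16_mask_last zeroes the last n_last lanes of v and keeps the first
   16 - n_last lanes intact: masks[k] has its first k lanes set to all-ones,
   so masks[16 - n_last] clears exactly the trailing n_last elements.
   Illustrative usage (not part of the original header):

     u16x16 v = u16x16_splat (0xffff);
     v = u16x16_mask_last (v, 4);   // lanes 12..15 are now 0
*/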

static_always_inline f32x8
f32x8_from_u32x8 (u32x8 v)
{
  return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
}

static_always_inline u32x8
u32x8_from_f32x8 (f32x8 v)
{
  return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
}
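/* Note: u32x8_from_f32x8 uses the truncating conversion
   (_mm256_cvttps_epi32), so fractional parts are dropped rather than
   rounded.  f32x8_from_u32x8 maps to the signed _mm256_cvtepi32_ps, so
   inputs of 2^31 or larger do not convert as unsigned values. */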

#define u16x16_blend(v1, v2, mask) \
  (u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), mask)
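/* u16x16_blend selects 16-bit lanes from v2 where the corresponding mask bit
   is set and from v1 where it is clear.  It is a macro rather than a
   function because _mm256_blend_epi16 requires a compile-time constant mask;
   the 8-bit immediate is applied to each 128-bit lane, so bit n controls
   lanes n and n + 8. */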

static_always_inline u64x4
u64x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u64x4 r = {
    *(u64 *) p0, *(u64 *) p1, *(u64 *) p2, *(u64 *) p3
  };
  return r;
}

static_always_inline u32x8
u32x8_gather (void *p0, void *p1, void *p2, void *p3, void *p4, void *p5,
	      void *p6, void *p7)
{
  u32x8 r = {
    *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3,
    *(u32 *) p4, *(u32 *) p5, *(u32 *) p6, *(u32 *) p7,
  };
  return r;
}
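/* The gather helpers load each element with an ordinary scalar load from an
   independent pointer and let the compiler assemble the vector; they do not
   use the AVX2 gather instructions, which take indices relative to a single
   base address and are not a good fit when the pointers are already
   available as separate scalars. */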

static_always_inline void
u64x4_scatter (u64x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
  *(u64 *) p2 = r[2];
  *(u64 *) p3 = r[3];
}

static_always_inline void
u32x8_scatter (u32x8 r, void *p0, void *p1, void *p2, void *p3, void *p4,
	       void *p5, void *p6, void *p7)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
  *(u32 *) p4 = r[4];
  *(u32 *) p5 = r[5];
  *(u32 *) p6 = r[6];
  *(u32 *) p7 = r[7];
}

static_always_inline void
u64x4_scatter_one (u64x4 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x8_scatter_one (u32x8 r, int index, void *p)
{
  *(u32 *) p = r[index];
}
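/* The scatter helpers are the mirror image of the gathers above: each vector
   element is written back with a scalar store to its own destination
   pointer, and the _scatter_one variants store a single selected lane.
   Illustrative round trip (not part of the original header):

     u64x4 v = u64x4_gather (a, b, c, d);
     u64x4_scatter (v + u64x4_splat (1), a, b, c, d);
*/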

#endif /* included_vector_avx2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */