/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c                                 \
t##s##x##c##_splat (t##s x)                                     \
{ return (t##s##x##c) _mm_set1_##i (x); }                       \
\
static_always_inline t##s##x##c                                 \
t##s##x##c##_load_unaligned (void *p)                           \
{ return (t##s##x##c) _mm_loadu_si128 (p); }                    \
\
static_always_inline void                                       \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p)            \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); }              \
\
static_always_inline int                                        \
t##s##x##c##_is_all_zero (t##s##x##c x)                         \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); }          \
\
static_always_inline int                                        \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b)              \
{ return t##s##x##c##_is_all_zero (a ^ b); }                    \
\
static_always_inline int                                        \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)                \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _
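
/* Usage sketch (illustrative only; buf is a hypothetical local): every lane
 * type above gets _splat, unaligned load/store and whole-vector compare
 * helpers, e.g.
 *
 *   u32 buf[4] = { 1, 2, 3, 4 };
 *   u32x4 v = u32x4_load_unaligned (buf);        // no alignment required
 *   if (u32x4_is_all_equal (v, 1))               // true only if every lane == 1
 *     ;
 *   u32x4_store_unaligned (u32x4_splat (0), buf);
 */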

/* min, max */
#define _(t, s, c, i) \
static_always_inline t##s##x##c                                     \
t##s##x##c##_min (t##s##x##c a, t##s##x##c b)                       \
{ return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); }     \
\
static_always_inline t##s##x##c                                     \
t##s##x##c##_max (t##s##x##c a, t##s##x##c b)                       \
{ return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); }

_(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
_(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
#undef _
/* *INDENT-ON* */
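
/* Note: the 8/16/32-bit min/max instantiations above expand to SSE2/SSE4.1
 * intrinsics, while the 64-bit ones (_mm_min_epi64 / _mm_max_epu64 etc.)
 * exist only as AVX-512VL intrinsics, so those variants are usable only when
 * the translation unit targets such a CPU. */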

#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
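
/* Example: the interleaves merge corresponding lanes from the low (or high)
 * halves of two vectors, e.g. with a = {a0..a15}, b = {b0..b15},
 * u8x16_interleave_lo (a, b) yields {a0,b0,a1,b1,...,a7,b7}. A sketch of the
 * classic zero-extension idiom, with 'bytes' being any u8x16 value:
 *
 *   u16x8 widened = (u16x8) u8x16_interleave_lo (bytes, u8x16_splat (0));
 */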

/* 128 bit packs. */
#define _(f, t, fn)                                                           \
  always_inline t t##_pack (f lo, f hi)                                       \
  {                                                                           \
    return (t) fn ((__m128i) lo, (__m128i) hi);                               \
  }

_ (i16x8, i8x16, _mm_packs_epi16)
_ (i16x8, u8x16, _mm_packus_epi16)
_ (i32x4, i16x8, _mm_packs_epi32)
_ (i32x4, u16x8, _mm_packus_epi32)

#undef _
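
/* Example: the packs narrow two source vectors into one with saturation,
 * e.g. i8x16_pack takes 8 + 8 signed 16-bit lanes and produces 16 signed
 * 8-bit lanes with signed saturation, while u8x16_pack saturates to the
 * unsigned 8-bit range. A sketch, lo and hi being any i16x8 values:
 *
 *   u8x16 narrowed = u8x16_pack (lo, hi);
 */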

#define _signed_binop(n,m,f,g)                                          \
  /* Unsigned */                                                        \
  always_inline u##n##x##m                                              \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)                         \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }        \
                                                                        \
  /* Signed */                                                          \
  always_inline i##n##x##m                                              \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)                         \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
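
/* Note: as instantiated above, both the u- and the i-prefixed saturating
 * helpers expand to the unsigned-saturating intrinsics (_mm_adds_epu8 etc.),
 * so callers needing signed saturation semantics should be aware of this.
 * Typical use, clamping a byte-wise sum at 255 (a and b any u8x16 values):
 *
 *   u8x16 sum = u8x16_add_saturate (a, b);
 */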
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

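/* Example: a full 16x16 -> 32 bit product can be assembled from the low and
 * high product halves together with the interleaves above (a sketch, x and y
 * being any u16x8 values):
 *
 *   u16x8 lo = u16x8_mul_lo (x, y), hi = u16x8_mul_hi (x, y);
 *   u32x4 p0 = (u32x4) u16x8_interleave_lo (lo, hi);   // products of lanes 0..3
 *   u32x4 p1 = (u32x4) u16x8_interleave_hi (lo, hi);   // products of lanes 4..7
 */
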
/* 128 bit shifts. */

#define _(p,a,b,c,f)                                                    \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }           \
                                                                        \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _

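/* Note: the _ishift_ variants take a scalar count, while the _shift_
 * variants follow the underlying _mm_sll/_mm_srl/_mm_sra semantics and read
 * the count from the low 64 bits of the vector operand, applying the same
 * count to every lane. A sketch, x being any u32x4 value:
 *
 *   u32x4 doubled = u32x4_ishift_left (x, 1);   // multiply each lane by 2
 */
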
#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

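/* Note: the word shifts move whole elements across the 128-bit register
 * ("right" shifts move elements toward lane 0, "left" toward higher lanes,
 * zero-filling the vacated lanes), e.g. u32x4_word_shift_right (v, 1) yields
 * { v[1], v[2], v[3], 0 }; the scalar reductions below rely on this. */
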
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)                                          \
  always_inline t##x##n                                         \
  t##x##n##_irotate_##lr1 (t##x##n w, int i)                    \
  {                                                             \
    ASSERT (i >= 0 && i <= BITS (t));                           \
    return (t##x##n##_ishift_##lr1 (w, i)                       \
            | t##x##n##_ishift_##lr2 (w, BITS (t) - i));        \
  }                                                             \
                                                                \
  always_inline t##x##n                                         \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)                 \
  {                                                             \
    t##x##n j = t##x##n##_splat (BITS (t));                     \
    return (t##x##n##_shift_##lr1 (w, i)                        \
            | t##x##n##_shift_##lr2 (w, j - i));                \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _

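/* Example (sketch): for 16-bit lanes, u16x8_irotate_left (w, 3) computes
 * (w << 3) | (w >> 13) per lane, i.e. a plain bit rotate built from the two
 * opposing shifts defined above. */
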
always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

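/* The scalar reductions above fold the vector onto itself, halving the
 * number of candidate lanes at each step, so the min/max of all lanes ends
 * up in lane 0, which is then extracted. */
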
/* Extract 16 bytes of the 32-byte a:b concatenation, starting at byte
   'imm' of b (SSSE3 palignr). */
#define u8x16_align_right(a, b, imm) \
  (u8x16) _mm_alignr_epi8 ((__m128i) a, (__m128i) b, imm)

static_always_inline u32
u32x4_min_scalar (u32x4 v)
{
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u32
u32x4_max_scalar (u32x4 v)
{
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_min_scalar (i32x4 v)
{
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_max_scalar (i32x4 v)
{
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u16
u8x16_msb_mask (u8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

static_always_inline u16
i8x16_msb_mask (i8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

#define CLIB_HAVE_VEC128_MSB_MASK

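/* Example: _msb_mask packs the most significant bit of each byte lane into a
 * 16-bit scalar (lane 0 maps to bit 0), so per-lane compare results can be
 * walked with a bit scan. A sketch, v being any u8x16 value and 'first' a
 * hypothetical local:
 *
 *   u16 m = u8x16_msb_mask (v);
 *   if (m)
 *     first = __builtin_ctz (m);   // index of first byte with its top bit set
 */
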
#undef _signed_binop

static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
  u8x16 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

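/* Example: the byte swaps reverse endianness within each lane, so four
 * big-endian (network order) u32 values can be converted to host order in
 * one step on little-endian x86 (p being a hypothetical pointer to 16 bytes):
 *
 *   u32x4 host = u32x4_byte_swap (u32x4_load_unaligned (p));
 */
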
static_always_inline u8x16
u8x16_reflect (u8x16 v)
{
  u8x16 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
}

static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{
  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
}

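/* Example: _mm_hadd_epi32 sums adjacent pairs, so
 * u32x4_hadd ({a,b,c,d}, {e,f,g,h}) returns {a+b, c+d, e+f, g+h}. */
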
static_always_inline u32 __clib_unused
u32x4_sum_elts (u32x4 sum4)
{
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 8);
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 4);
  return sum4[0];
}

static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
}

static_always_inline u32x4
u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
{
#if defined(__clang__) || !__OPTIMIZE__
  u32x4 r = { v[a], v[b], v[c], v[d] };
  return r;
#else
  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
				    a | b << 2 | c << 4 | d << 6);
#endif
}

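/* Example: the lane selectors must be compile-time constants when the
 * _mm_shuffle_epi32 path is taken, e.g. reversing a vector:
 *
 *   u32x4 rev = u32x4_shuffle (v, 3, 2, 1, 0);
 */
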
/* _from_ */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t                                          \
t##_from_##f (f x)                                              \
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _
/* *INDENT-ON* */
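
/* Example: the _from_ conversions widen the low lanes of the source with
 * zero- or sign-extension as appropriate; only as many source lanes as fit
 * in the destination are consumed, e.g. (b being any u8x16 value):
 *
 *   u16x8 w = u16x8_from_u8x16 (b);   // zero-extends the low 8 bytes of b
 */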

static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

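/* Example: gathers build a vector from scalars behind separate pointers and
 * scatters write it back, which suits per-object counters (a sketch with
 * four hypothetical u32 * pointers p0..p3):
 *
 *   u32x4 c = u32x4_gather (p0, p1, p2, p3);
 *   c += u32x4_splat (1);
 *   u32x4_scatter (c, p0, p1, p2, p3);
 */
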
static_always_inline u8x16
u8x16_is_greater (u8x16 v1, u8x16 v2)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u8x16
u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
{
  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
}

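/* Note: u8x16_is_greater is a signed byte compare (_mm_cmpgt_epi8) returning
 * all-ones lanes where v1 > v2, and u8x16_blend picks, per byte, v2 where the
 * top bit of mask is set and v1 elsewhere. A sketch (a and b any u8x16
 * values): a branch-free signed per-byte maximum:
 *
 *   u8x16 m = u8x16_blend (a, b, u8x16_is_greater (b, a));
 */
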
static_always_inline u8x16
u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
{
#if __AVX512F__
  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
					 (__m128i) c, 0x96);
#endif
  return a ^ b ^ c;
}

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */