/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm_loadu_si128 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _
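
/* Usage sketch (illustrative only, not part of the generated API): the macro
   above is expanded once per type in the foreach lists, so for u32x4 it emits
   u32x4_splat, u32x4_load_unaligned, u32x4_store_unaligned, u32x4_is_all_zero,
   u32x4_is_equal and u32x4_is_all_equal, used roughly like:

     u32 buf[4] = { 5, 5, 5, 5 };
     u32x4 v = u32x4_load_unaligned (buf);
     if (u32x4_is_all_equal (v, 5))
       u32x4_store_unaligned (u32x4_splat (0), buf);
*/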

/* min, max */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_min (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_max (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); }

_(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
_(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
#undef _
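
/* Illustrative example (values assumed): min/max operate lane by lane, e.g.
   u8x16_min (v, u8x16_splat (7)) clamps every byte of v to at most 7, and
   u8x16_max (a, b) keeps the larger byte from each lane pair. */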

#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
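
/* Illustrative example: the interleave helpers mirror PUNPCKL/PUNPCKH, e.g.
   for u8x16 a = { a0, a1, ..., a15 } and b = { b0, b1, ..., b15 }:
     u8x16_interleave_lo (a, b) -> { a0, b0, a1, b1, ..., a7, b7 }
     u8x16_interleave_hi (a, b) -> { a8, b8, a9, b9, ..., a15, b15 } */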

/* 128 bit packs. */
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m128i) lo, (__m128i) hi); \
  }

_ (i16x8, i8x16, _mm_packs_epi16)
_ (i16x8, u8x16, _mm_packus_epi16)
_ (i32x4, i16x8, _mm_packs_epi32)
_ (i32x4, u16x8, _mm_packus_epi32)

#undef _
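
/* Illustrative example (values assumed): the pack helpers narrow two vectors
   into one with saturation, e.g.
     u8x16 r = u8x16_pack (lo, hi);
   where lo and hi are i16x8; a lane holding 300 saturates to 255 and a lane
   holding -1 saturates to 0, since u8x16_pack maps to _mm_packus_epi16. */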

#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##_epu##n ((__m128i) x, (__m128i) y); } \
  \
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##_epi##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds)
_signed_binop (16, 8, add_saturate, adds)
_signed_binop (8, 16, sub_saturate, subs)
_signed_binop (16, 8, sub_saturate, subs)

/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
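
/* Illustrative example (values assumed): saturating add/sub clamp instead of
   wrapping, and mul_lo/mul_hi return the low/high 16 bits of the full 32-bit
   product, e.g.
     u8x16_add_saturate (u8x16_splat (250), u8x16_splat (10))  -> all 255
     u16x8_mul_hi (u16x8_splat (0x8000), u16x8_splat (4))      -> all 2 */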

/* 128 bit shifts. */

#define _(p,a,b,c,f) \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
  \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _

#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
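
/* Illustrative note: the "word" shifts move whole lanes, not bits;
   u8x16_word_shift_right (x, 1) discards lane 0, moves every remaining byte
   down one position and zero-fills the top, and u32x4_word_shift_right (x, 1)
   does the same with 32-bit lanes.  The scalar min/max reductions below are
   built on these. */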

/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
	    | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
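
/* Illustrative example: a rotate is composed from two opposite shifts, e.g.
   u32x4_irotate_left (v, 8) is equivalent to (v << 8) | (v >> 24) in every
   32-bit lane. */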

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

#define u8x16_align_right(a, b, imm) \
  (u8x16) _mm_alignr_epi8 ((__m128i) a, (__m128i) b, imm)

static_always_inline u32
u32x4_min_scalar (u32x4 v)
{
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u32
u32x4_max_scalar (u32x4 v)
{
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_min_scalar (i32x4 v)
{
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_max_scalar (i32x4 v)
{
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}
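
/* Illustrative example (values assumed): the scalar reductions fold the
   vector onto itself, halving the number of candidate lanes each step, e.g.
     u32x4 v = { 7, 1, 9, 4 };
     u32x4_max_scalar (v) -> 9, u32x4_min_scalar (v) -> 1 */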

static_always_inline u16
u8x16_msb_mask (u8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

static_always_inline u16
i8x16_msb_mask (i8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

#define CLIB_HAVE_VEC128_MSB_MASK
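
/* Illustrative note: u8x16_msb_mask collects the most significant bit of
   each byte into a 16-bit scalar, so on the result of a lane compare (where
   matching lanes are 0xff) bit i of the mask is set iff lane i matched. */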

#undef _signed_binop

static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
  u8x16 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
  };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u8x16
u8x16_reflect (u8x16 v)
{
  u8x16 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
}

static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{
  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u32 __clib_unused
u32x4_sum_elts (u32x4 sum4)
{
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 8);
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 4);
  return sum4[0];
}
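
/* Illustrative example (values assumed): the byte swaps reverse byte order
   within each lane (host <-> network order), e.g. u32x4_byte_swap on a lane
   holding 0x0a000001 yields 0x0100000a; u32x4_sum_elts adds all four lanes
   into a single u32. */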

/* _from_ */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _
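
/* Illustrative example (values assumed): the *_from_* helpers zero-extend
   (u types) or sign-extend (i types) each lane into a wider vector, keeping
   only as many source lanes as fit, e.g.
     u8x16 v = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
     u16x8 w = u16x8_from_u8x16 (v);   /- w is { 1, 2, 3, 4, 5, 6, 7, 8 } -/ */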

static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{
  *(u32 *) p = r[index];
}
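
/* Illustrative example (values assumed): gather/scatter move lanes to and
   from unrelated addresses, e.g.
     u32 a = 1, b = 2, c = 3, d = 4;
     u32x4 v = u32x4_gather (&a, &b, &c, &d);   -> { 1, 2, 3, 4 }
     u32x4_scatter_one (v, 3, &a);              -> a becomes 4 */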

static_always_inline u8x16
u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
{
  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
}

static_always_inline u8x16
u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
{
#if __AVX512F__
  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
					 (__m128i) c, 0x96);
#endif
  return a ^ b ^ c;
}

static_always_inline u8x16
u8x16_load_partial (u8 *data, uword n)
{
  u8x16 r = {};
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
  return u8x16_mask_load_zero (data, pow2_mask (n));
#endif
  if (n > 7)
    {
      u64x2 r;
      r[1] = *(u64u *) (data + n - 8);
      r >>= (16 - n) * 8;
      r[0] = *(u64u *) data;
      return (u8x16) r;
    }
  else if (n > 3)
    {
      u32x4 r = {};
      r[1] = *(u32u *) (data + n - 4);
      r >>= (8 - n) * 8;
      r[0] = *(u32u *) data;
      return (u8x16) r;
    }
  else if (n > 1)
    {
      u16x8 r = {};
      r[1] = *(u16u *) (data + n - 2);
      r >>= (4 - n) * 8;
      r[0] = *(u16u *) data;
      return (u8x16) r;
    }
  else if (n > 0)
    r[0] = *data;
  return r;
}

static_always_inline void
u8x16_store_partial (u8x16 r, u8 *data, uword n)
{
#if defined(CLIB_HAVE_VEC128_MASK_LOAD_STORE)
  u8x16_mask_store (r, data, pow2_mask (n));
#else
  if (n > 7)
    {
      *(u64u *) (data + n - 8) = ((u64x2) r)[1] << ((16 - n) * 8);
      *(u64u *) data = ((u64x2) r)[0];
    }
  else if (n > 3)
    {
      *(u32u *) (data + n - 4) = ((u32x4) r)[1] << ((8 - n) * 8);
      *(u32u *) data = ((u32x4) r)[0];
    }
  else if (n > 1)
    {
      *(u16u *) (data + n - 2) = ((u16x8) r)[1] << ((4 - n) * 8);
      *(u16u *) data = ((u16x8) r)[0];
    }
  else if (n > 0)
    data[0] = r[0];
#endif
}
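
/* Usage sketch (illustrative only): load or store just the first n bytes of
   a vector without touching memory past data + n, e.g. when copying a short
   packet tail:
     u8x16 t = u8x16_load_partial (src, 5);
     u8x16_store_partial (t, dst, 5);   -- copies exactly 5 bytes */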

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */