/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h> /* for ASSERT */
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm_loadu_si128 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _
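
/* Usage sketch (illustrative only): the macro above expands once per type
   in the foreach lists, e.g. for u32x4 it emits u32x4_splat,
   u32x4_load_unaligned, u32x4_store_unaligned, u32x4_is_all_zero,
   u32x4_is_equal and u32x4_is_all_equal:

     u32 buf[4] = { 1, 2, 3, 4 };
     u32x4 v = u32x4_load_unaligned (buf);
     if (u32x4_is_all_equal (v, 1))                  // false: lanes differ
       ;
     u32x4_store_unaligned (u32x4_splat (7), buf);   // buf = { 7, 7, 7, 7 }
*/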

/* min, max */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_min (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_max (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); }

_(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
_(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
#undef _
/* *INDENT-ON* */

#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
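
/* Illustrative example: the _lo/_hi variants alternate lanes taken from
   the low or high half of each source vector.  Assuming little-endian
   lane order:

     u8x16 a = { 0, 1, 2, ..., 15 };                 // conceptually
     u8x16 b = { 16, 17, ..., 31 };
     u8x16_interleave_lo (a, b) => { 0, 16, 1, 17, ..., 7, 23 }
     u8x16_interleave_hi (a, b) => { 8, 24, 9, 25, ..., 15, 31 }
*/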

/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

/* 128 bit packs. */
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m128i) lo, (__m128i) hi); \
  }

_ (i16x8, i8x16, _mm_packs_epi16)
_ (i16x8, u8x16, _mm_packus_epi16)
_ (i32x4, i16x8, _mm_packs_epi32)
_ (i32x4, u16x8, _mm_packus_epi32)

#undef _
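
/* Illustrative example: a pack narrows two source vectors into one result
   with saturation, e.g. i16x8 -> u8x16 clamps each lane to [0, 255]:

     i16x8 lo = i16x8_splat (300), hi = i16x8_splat (-5);
     u8x16 p = u8x16_pack (lo, hi);   // first 8 lanes 255, last 8 lanes 0
*/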

#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
  \
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
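/* Note: all four instantiations above pass the unsigned-saturating
   intrinsics (adds_epu/subs_epu), so both the u* and the i* wrappers
   generated here saturate as unsigned.  Illustrative example:

     u8x16 x = u8x16_splat (250);
     u8x16 y = u8x16_splat (10);
     u8x16 s = u8x16_add_saturate (x, y);   // every lane clamps to 255
*/
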
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* signed high-half multiply: use the signed intrinsic */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
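
/* Illustrative example: the full 16x16 -> 32 bit product can be rebuilt
   from the low and high halves, e.g. for lane 0:

     u16x8 a = u16x8_splat (1000), b = u16x8_splat (70);
     u32 lane0 = (u32) u16x8_mul_lo (a, b)[0]
       | ((u32) u16x8_mul_hi (a, b)[0] << 16);        // 70000
*/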

/* 128 bit shifts. */

#define _(p,a,b,c,f) \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
  \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
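
/* Illustrative example: the _ishift_ variants take an immediate count,
   the _shift_ variants take the count from the low 64 bits of a vector:

     u32x4 v = u32x4_splat (4);
     u32x4 cnt = { 3, 0, 0, 0 };
     v = u32x4_ishift_left (v, 2);      // every lane becomes 16
     v = u32x4_shift_right (v, cnt);    // every lane becomes 2
*/
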
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
};

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
};

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
};

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
};

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
};

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
};

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
};

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
};

#define u8x16_word_shift_left(a,n) (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
            | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
            | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
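
/* Illustrative example: a rotate is composed of two opposite shifts:

     u32x4 v = u32x4_splat (0x80000001);
     v = u32x4_irotate_left (v, 1);   // every lane becomes 0x00000003
*/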

#ifndef __clang__
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
  { \
    int m = sizeof (t##x##n) / sizeof (t); \
    ASSERT (i >= 0 && i < m); \
    return (t##x##n##_word_shift_##lr1 (w0, i) \
            | t##x##n##_word_shift_##lr2 (w1, m - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

#define u32x4_select(A,MASK) \
({ \
  u32x4 _x, _y; \
  _x = (A); \
  asm volatile ("pshufd %[mask], %[x], %[y]" \
                : /* outputs */ [y] "=x" (_y) \
                : /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y; \
})

#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << (2*0)) \
                      | ((i) << (2*1)) \
                      | ((i) << (2*2)) \
                      | ((i) << (2*3))))

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
                : /* outputs */ [result] "=r" (result)
                : /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
                : /* outputs */ [result] "=x" (result)
                : /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}
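
/* Illustrative example: a vector compare yields all-ones/all-zeros lanes,
   and the byte mask packs one bit per byte lane into the low 16 bits:

     u8x16 a = u8x16_splat (5), b = u8x16_splat (5);
     u32 m = u8x16_compare_byte_mask (a == b);   // 0xffff when all equal
*/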

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
          | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (x == zero);
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

#define u8x16_align_right(a, b, imm) \
  (u8x16) _mm_alignr_epi8 ((__m128i) a, (__m128i) b, imm)

static_always_inline u32
u32x4_min_scalar (u32x4 v)
{
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u32
u32x4_max_scalar (u32x4 v)
{
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_min_scalar (i32x4 v)
{
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_max_scalar (i32x4 v)
{
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}
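
/* Illustrative example: the *_scalar helpers reduce a vector to a single
   lane value by repeatedly folding the vector onto itself:

     u32x4 v = { 7, 3, 9, 1 };
     u32 lo = u32x4_min_scalar (v);   // 1
     u32 hi = u32x4_max_scalar (v);   // 9
*/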

static_always_inline u16
u8x16_msb_mask (u8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

static_always_inline u16
i8x16_msb_mask (i8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

#define CLIB_HAVE_VEC128_MSB_MASK

#undef _signed_binop

static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
  u8x16 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
  };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}
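
/* Illustrative example: per-lane endian swap via PSHUFB, e.g. for u16x8:

     u16x8 v = u16x8_splat (0x1234);
     v = u16x8_byte_swap (v);   // every lane becomes 0x3412
*/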

static_always_inline u8x16
u8x16_reflect (u8x16 v)
{
  u8x16 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
}

static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{
  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u32 __clib_unused
u32x4_sum_elts (u32x4 sum4)
{
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 8);
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 4);
  return sum4[0];
}

static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
}

static_always_inline u32x4
u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
{
#if defined(__clang__) || !__OPTIMIZE__
  u32x4 r = { v[a], v[b], v[c], v[d] };
  return r;
#else
  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
                                    a | b << 2 | c << 4 | d << 6);
#endif
}
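
/* Illustrative example: a, b, c, d are lane indices into the source, so
   reversing a vector looks like this:

     u32x4 v = { 10, 11, 12, 13 };
     u32x4 r = u32x4_shuffle (v, 3, 2, 1, 0);   // { 13, 12, 11, 10 }
*/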

/* _from_ */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _
/* *INDENT-ON* */
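
/* Illustrative example: the _from_ helpers widen the low lanes of the
   source, zero-extending for u* types and sign-extending for i* types:

     u8x16 v8 = u8x16_splat (200);
     u32x4 v32 = u32x4_from_u8x16 (v8);   // { 200, 200, 200, 200 }
*/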

static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{
  *(u32 *) p = r[index];
}
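
/* Illustrative example: gather/scatter here are plain scalar loads and
   stores through the given pointers, not hardware gather instructions:

     u32 a = 1, b = 2, c = 3, d = 4;
     u32x4 v = u32x4_gather (&a, &b, &c, &d);   // { 1, 2, 3, 4 }
     u32x4_scatter_one (v, 3, &a);              // a = 4
*/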

static_always_inline u8x16
u8x16_is_greater (u8x16 v1, u8x16 v2)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u8x16
u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
{
  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
}

static_always_inline u8x16
u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
{
#if __AVX512F__
  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
                                         (__m128i) c, 0x96);
#endif
  return a ^ b ^ c;
}

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */