/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h> /* for ASSERT */
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_sse42_vec128i \
  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) _(f,64,2,pd)

/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
   is_all_equal */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm_loadu_si128 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

foreach_sse42_vec128i foreach_sse42_vec128u
#undef _
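
/* Illustrative usage of the generated helpers above (a sketch, not part of
   the original API; p is an assumed pointer to 16 readable/writable bytes):

     u32x4 v = u32x4_splat (0xdeadbeef);
     u32x4 w = u32x4_load_unaligned (p);
     if (u32x4_is_equal (v, w))
       u32x4_store_unaligned (v, p);

   u32x4_is_equal returns non-zero only when every lane matches. */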

/* min, max */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_min (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_max (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); }

_(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
_(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
#undef _
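/* Note: the 8/16/32-bit min/max variants above map to SSE2/SSE4.1
   instructions, but _mm_min_epi64/_mm_max_epi64 and _mm_min_epu64/
   _mm_max_epu64 are AVX-512 (F+VL) intrinsics, so the 64-bit helpers are
   only usable on targets that enable those extensions. */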
/* *INDENT-ON* */

#define CLIB_VEC128_SPLAT_DEFINED
#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}
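
/* Illustrative semantics (a sketch, not part of the original header):
   the unpack instructions interleave lanes from the two operands, so with
   a = {a0,...,a15} and b = {b0,...,b15},

     u8x16_interleave_lo (a, b) == {a0,b0,a1,b1,...,a7,b7}
     u8x16_interleave_hi (a, b) == {a8,b8,a9,b9,...,a15,b15}

   The wider-lane variants behave the same way on 16/32/64-bit lanes. */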

/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}
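
/* Note: u16x8_pack and u16x4_pack narrow with packus, saturating each lane
   into the unsigned 0..255 range, while u32x4_pack and u32x2_pack use the
   signed pack instructions (packssdw) despite their unsigned names, matching
   the historical behaviour of this header. */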

#ifndef __ICC
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
#endif

#define _signed_binop(n,m,f,g) \
  /* Unsigned */ \
  always_inline u##n##x##m \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
  \
  /* Signed */ \
  always_inline i##n##x##m \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation. */
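/* Note: both the u* and i* variants below are generated with the
   unsigned-saturation intrinsics (adds_epu* / subs_epu*), so
   i8x16_add_saturate and friends do not perform true signed saturation. */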
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* signed high multiply (pmulhw), not the unsigned pmulhuw */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

/* 128 bit shifts. */

#define _(p,a,b,c,f) \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
  \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
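
/* The generated _ishift_* helpers apply the same scalar count to every lane
   (psllw/pslld/psllq etc. with an integer count); the _shift_* helpers take
   the count from the low 64 bits of the second vector operand. */
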
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}

#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
  { \
    ASSERT (i >= 0 && i <= BITS (t)); \
    return (t##x##n##_ishift_##lr1 (w, i) \
            | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
  { \
    t##x##n j = t##x##n##_splat (BITS (t)); \
    return (t##x##n##_shift_##lr1 (w, i) \
            | t##x##n##_shift_##lr2 (w, j - i)); \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
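
/* Illustrative usage (a sketch): rotating each 16-bit lane by 8 swaps the
   two bytes of that lane, e.g.

     u16x8 v = u16x8_splat (0x1234);
     v = u16x8_irotate_left (v, 8);

   leaves every lane equal to 0x3412. */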

#ifndef __clang__
#define _(t,n,lr1,lr2) \
  always_inline t##x##n \
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
  { \
    int m = sizeof (t##x##n) / sizeof (t); \
    ASSERT (i >= 0 && i < m); \
    return (t##x##n##_word_shift_##lr1 (w0, i) \
            | t##x##n##_word_shift_##lr2 (w1, m - i)); \
  } \
  \
  always_inline t##x##n \
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

#define u32x4_select(A,MASK) \
({ \
  u32x4 _x, _y; \
  _x = (A); \
  asm volatile ("pshufd %[mask], %[x], %[y]" \
                : /* outputs */ [y] "=x" (_y) \
                : /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y; \
})

#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << (2*0)) \
                      | ((i) << (2*1)) \
                      | ((i) << (2*2)) \
                      | ((i) << (2*3))))

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
                : /* outputs */ [result] "=r" (result)
                : /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
                : /* outputs */ [result] "=x" (result)
                : /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Convert a per-byte all-ones/all-zeros compare result into a bitmask. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
          | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (x == zero);
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) (x == zero));
}

always_inline u8
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

#define u8x16_align_right(a, b, imm) \
  (u8x16) _mm_alignr_epi8 ((__m128i) a, (__m128i) b, imm)

static_always_inline u32
u32x4_min_scalar (u32x4 v)
{
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u32
u32x4_max_scalar (u32x4 v)
{
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_min_scalar (i32x4 v)
{
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline i32
i32x4_max_scalar (i32x4 v)
{
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
  return v[0];
}

static_always_inline u16
u8x16_msb_mask (u8x16 v)
{
  return _mm_movemask_epi8 ((__m128i) v);
}

#define CLIB_HAVE_VEC128_MSB_MASK

#undef _signed_binop

static_always_inline u32x4
u32x4_byte_swap (u32x4 v)
{
  u8x16 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

static_always_inline u16x8
u16x8_byte_swap (u16x8 v)
{
  u8x16 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
  };
  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
}

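/* Illustrative usage (a sketch): on little-endian x86 a lane-wise byte swap
   converts between host and network byte order, e.g. eight u16 port numbers
   at once:

     u16x8 ports = u16x8_byte_swap (u16x8_load_unaligned (p));
*/
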
static_always_inline u8x16
u8x16_reflect (u8x16 v)
{
  u8x16 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  };
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
}

static_always_inline u32x4
u32x4_hadd (u32x4 v1, u32x4 v2)
{
  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u32 __clib_unused
u32x4_sum_elts (u32x4 sum4)
{
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 8);
  sum4 += (u32x4) u8x16_align_right (sum4, sum4, 4);
  return sum4[0];
}

static_always_inline u8x16
u8x16_shuffle (u8x16 v, u8x16 m)
{
  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
}

static_always_inline u32x4
u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
{
#if defined(__clang__) || !__OPTIMIZE__
  u32x4 r = { v[a], v[b], v[c], v[d] };
  return r;
#else
  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
                                    a | b << 2 | c << 4 | d << 6);
#endif
}

/* widening conversions: <dst>_from_<src> */
/* *INDENT-OFF* */
#define _(f,t,i) \
static_always_inline t \
t##_from_##f (f x) \
{ return (t) _mm_cvt##i ((__m128i) x); }

_(u8x16, u16x8, epu8_epi16)
_(u8x16, u32x4, epu8_epi32)
_(u8x16, u64x2, epu8_epi64)
_(u16x8, u32x4, epu16_epi32)
_(u16x8, u64x2, epu16_epi64)
_(u32x4, u64x2, epu32_epi64)

_(i8x16, i16x8, epi8_epi16)
_(i8x16, i32x4, epi8_epi32)
_(i8x16, i64x2, epi8_epi64)
_(i16x8, i32x4, epi16_epi32)
_(i16x8, i64x2, epi16_epi64)
_(i32x4, i64x2, epi32_epi64)
#undef _
/* *INDENT-ON* */
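
/* Illustrative semantics (a sketch): the generated helpers widen the low
   lanes of the source vector, e.g. u16x8_from_u8x16 zero-extends the low 8
   bytes of its argument into eight u16 lanes (SSE4.1 pmovzxbw); the
   i*_from_i* forms sign-extend instead. */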

static_always_inline u64x2
u64x2_gather (void *p0, void *p1)
{
  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
  return r;
}

static_always_inline u32x4
u32x4_gather (void *p0, void *p1, void *p2, void *p3)
{
  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
  return r;
}

static_always_inline void
u64x2_scatter (u64x2 r, void *p0, void *p1)
{
  *(u64 *) p0 = r[0];
  *(u64 *) p1 = r[1];
}

static_always_inline void
u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
{
  *(u32 *) p0 = r[0];
  *(u32 *) p1 = r[1];
  *(u32 *) p2 = r[2];
  *(u32 *) p3 = r[3];
}

static_always_inline void
u64x2_scatter_one (u64x2 r, int index, void *p)
{
  *(u64 *) p = r[index];
}

static_always_inline void
u32x4_scatter_one (u32x4 r, int index, void *p)
{
  *(u32 *) p = r[index];
}

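/* Illustrative usage (a sketch; s0..s3 and their count field are
   hypothetical): gather four counters scattered across separate structures,
   update them as one vector, then scatter the results back:

     u32x4 c = u32x4_gather (&s0->count, &s1->count, &s2->count, &s3->count);
     c += u32x4_splat (1);
     u32x4_scatter (c, &s0->count, &s1->count, &s2->count, &s3->count);
*/
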
static_always_inline u8x16
u8x16_is_greater (u8x16 v1, u8x16 v2)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2);
}

static_always_inline u8x16
u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
{
  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
}

static_always_inline u8x16
u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
{
#if __AVX512F__
  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
                                         (__m128i) c, 0x96);
#endif
  return a ^ b ^ c;
}

#ifdef __AVX512F__
static_always_inline u8x16
u8x16_mask_load (u8x16 a, void *p, u16 mask)
{
  return (u8x16) _mm_mask_loadu_epi8 ((__m128i) a, mask, p);
}
#endif

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */