blob: 5d6a47d3915dfef901460e2a5584a52f6171015c [file] [log] [blame]
Ed Warnickecb9cada2015-12-08 15:45:58 -07001/*
2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15/*
16 Copyright (c) 2005 Eliot Dresselhaus
17
18 Permission is hereby granted, free of charge, to any person obtaining
19 a copy of this software and associated documentation files (the
20 "Software"), to deal in the Software without restriction, including
21 without limitation the rights to use, copy, modify, merge, publish,
22 distribute, sublicense, and/or sell copies of the Software, and to
23 permit persons to whom the Software is furnished to do so, subject to
24 the following conditions:
25
26 The above copyright notice and this permission notice shall be
27 included in all copies or substantial portions of the Software.
28
29 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
33 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
34 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
35 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36*/
37
38#ifndef included_vector_sse2_h
39#define included_vector_sse2_h
40
Dave Barachc3799992016-08-15 11:12:27 -040041#include <vppinfra/error_bootstrap.h> /* for ASSERT */
Ed Warnickecb9cada2015-12-08 15:45:58 -070042#include <x86intrin.h>
43
Damjan Mariona52e1662018-05-19 00:04:23 +020044/* *INDENT-OFF* */
45#define foreach_sse42_vec128i \
46 _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
47#define foreach_sse42_vec128u \
48 _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
49#define foreach_sse42_vec128f \
50 _(f,32,4,ps) _(f,64,2,pd)
51
52/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
53 is_all_equal */
54#define _(t, s, c, i) \
55static_always_inline t##s##x##c \
56t##s##x##c##_splat (t##s x) \
57{ return (t##s##x##c) _mm_set1_##i (x); } \
58\
59static_always_inline t##s##x##c \
60t##s##x##c##_load_unaligned (void *p) \
61{ return (t##s##x##c) _mm_loadu_si128 (p); } \
62\
63static_always_inline void \
64t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
65{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
66\
67static_always_inline int \
68t##s##x##c##_is_all_zero (t##s##x##c x) \
69{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
70\
71static_always_inline int \
Damjan Marion14864772018-05-22 14:07:47 +020072t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
73{ return t##s##x##c##_is_all_zero (a ^ b); } \
Damjan Mariona52e1662018-05-19 00:04:23 +020074\
75static_always_inline int \
76t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
77{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }; \
78
79foreach_sse42_vec128i foreach_sse42_vec128u
80#undef _
81/* *INDENT-ON* */
82
83#define CLIB_VEC128_SPLAT_DEFINED
84#define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
85
Ed Warnickecb9cada2015-12-08 15:45:58 -070086/* 128 bit interleaves. */
Dave Barachc3799992016-08-15 11:12:27 -040087always_inline u8x16
88u8x16_interleave_hi (u8x16 a, u8x16 b)
89{
90 return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
91}
Ed Warnickecb9cada2015-12-08 15:45:58 -070092
Dave Barachc3799992016-08-15 11:12:27 -040093always_inline u8x16
94u8x16_interleave_lo (u8x16 a, u8x16 b)
95{
96 return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
97}
Ed Warnickecb9cada2015-12-08 15:45:58 -070098
Dave Barachc3799992016-08-15 11:12:27 -040099always_inline u16x8
100u16x8_interleave_hi (u16x8 a, u16x8 b)
101{
102 return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
103}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700104
Dave Barachc3799992016-08-15 11:12:27 -0400105always_inline u16x8
106u16x8_interleave_lo (u16x8 a, u16x8 b)
107{
108 return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
109}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700110
Dave Barachc3799992016-08-15 11:12:27 -0400111always_inline u32x4
112u32x4_interleave_hi (u32x4 a, u32x4 b)
113{
114 return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
115}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700116
Dave Barachc3799992016-08-15 11:12:27 -0400117always_inline u32x4
118u32x4_interleave_lo (u32x4 a, u32x4 b)
119{
120 return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
121}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700122
Dave Barachc3799992016-08-15 11:12:27 -0400123always_inline u64x2
124u64x2_interleave_hi (u64x2 a, u64x2 b)
125{
126 return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
127}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700128
Dave Barachc3799992016-08-15 11:12:27 -0400129always_inline u64x2
130u64x2_interleave_lo (u64x2 a, u64x2 b)
131{
132 return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
133}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700134
135/* 64 bit interleaves. */
Dave Barachc3799992016-08-15 11:12:27 -0400136always_inline u8x8
137u8x8_interleave_hi (u8x8 a, u8x8 b)
138{
139 return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
140}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700141
Dave Barachc3799992016-08-15 11:12:27 -0400142always_inline u8x8
143u8x8_interleave_lo (u8x8 a, u8x8 b)
144{
145 return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
146}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700147
Dave Barachc3799992016-08-15 11:12:27 -0400148always_inline u16x4
149u16x4_interleave_hi (u16x4 a, u16x4 b)
150{
151 return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
152}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700153
Dave Barachc3799992016-08-15 11:12:27 -0400154always_inline u16x4
155u16x4_interleave_lo (u16x4 a, u16x4 b)
156{
157 return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
158}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700159
Dave Barachc3799992016-08-15 11:12:27 -0400160always_inline u32x2
161u32x2_interleave_hi (u32x2 a, u32x2 b)
162{
163 return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
164}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700165
Dave Barachc3799992016-08-15 11:12:27 -0400166always_inline u32x2
167u32x2_interleave_lo (u32x2 a, u32x2 b)
168{
169 return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
170}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700171
172/* 128 bit packs. */
Dave Barachc3799992016-08-15 11:12:27 -0400173always_inline u8x16
174u16x8_pack (u16x8 lo, u16x8 hi)
175{
176 return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
177}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700178
Dave Barachc3799992016-08-15 11:12:27 -0400179always_inline i8x16
180i16x8_pack (i16x8 lo, i16x8 hi)
181{
182 return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
183}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700184
Dave Barachc3799992016-08-15 11:12:27 -0400185always_inline u16x8
186u32x4_pack (u32x4 lo, u32x4 hi)
187{
188 return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
189}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700190
191/* 64 bit packs. */
Dave Barachc3799992016-08-15 11:12:27 -0400192always_inline u8x8
193u16x4_pack (u16x4 lo, u16x4 hi)
194{
195 return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
196}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700197
Dave Barachc3799992016-08-15 11:12:27 -0400198always_inline i8x8
199i16x4_pack (i16x4 lo, i16x4 hi)
200{
201 return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
202}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700203
Dave Barachc3799992016-08-15 11:12:27 -0400204always_inline u16x4
205u32x2_pack (u32x2 lo, u32x2 hi)
206{
207 return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
208}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700209
Dave Barachc3799992016-08-15 11:12:27 -0400210always_inline i16x4
211i32x2_pack (i32x2 lo, i32x2 hi)
212{
213 return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
214}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700215
Ed Warnickecb9cada2015-12-08 15:45:58 -0700216#ifndef __ICC
Dave Barachc3799992016-08-15 11:12:27 -0400217always_inline u64x2
218u64x2_read_lo (u64x2 x, u64 * a)
219{
220 return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
221}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700222
Dave Barachc3799992016-08-15 11:12:27 -0400223always_inline u64x2
224u64x2_read_hi (u64x2 x, u64 * a)
225{
226 return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
227}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700228
Dave Barachc3799992016-08-15 11:12:27 -0400229always_inline void
230u64x2_write_lo (u64x2 x, u64 * a)
231{
232 _mm_storel_pi ((__m64 *) a, (__m128) x);
233}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700234
Dave Barachc3799992016-08-15 11:12:27 -0400235always_inline void
236u64x2_write_hi (u64x2 x, u64 * a)
237{
238 _mm_storeh_pi ((__m64 *) a, (__m128) x);
239}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700240#endif
241
Ed Warnickecb9cada2015-12-08 15:45:58 -0700242#define _signed_binop(n,m,f,g) \
243 /* Unsigned */ \
244 always_inline u##n##x##m \
245 u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
246 { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
247 \
248 /* Signed */ \
249 always_inline i##n##x##m \
250 i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
251 { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }
Ed Warnickecb9cada2015-12-08 15:45:58 -0700252/* Addition/subtraction with saturation. */
Damjan Mariona52e1662018-05-19 00:04:23 +0200253_signed_binop (8, 16, add_saturate, adds_epu)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700254_signed_binop (16, 8, add_saturate, adds_epu)
255_signed_binop (8, 16, sub_saturate, subs_epu)
256_signed_binop (16, 8, sub_saturate, subs_epu)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700257/* Multiplication. */
Dave Barachc3799992016-08-15 11:12:27 -0400258 always_inline i16x8 i16x8_mul_lo (i16x8 x, i16x8 y)
259{
260 return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
261}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700262
Dave Barachc3799992016-08-15 11:12:27 -0400263always_inline u16x8
264u16x8_mul_lo (u16x8 x, u16x8 y)
265{
266 return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
267}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700268
Dave Barachc3799992016-08-15 11:12:27 -0400269always_inline i16x8
270i16x8_mul_hi (i16x8 x, i16x8 y)
271{
272 return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
273}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700274
Dave Barachc3799992016-08-15 11:12:27 -0400275always_inline u16x8
276u16x8_mul_hi (u16x8 x, u16x8 y)
277{
278 return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
279}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700280
281/* 128 bit shifts. */
282
283#define _(p,a,b,c,f) \
284 always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
285 { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
286 \
287 always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
288 { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }
289
Dave Barachc3799992016-08-15 11:12:27 -0400290_(u, 16, 8, left, sll)
291_(u, 32, 4, left, sll)
292_(u, 64, 2, left, sll)
293_(u, 16, 8, right, srl)
294_(u, 32, 4, right, srl)
295_(u, 64, 2, right, srl)
296_(i, 16, 8, left, sll)
297_(i, 32, 4, left, sll)
298_(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700299#undef _
Ed Warnickecb9cada2015-12-08 15:45:58 -0700300/* 64 bit shifts. */
Dave Barachc3799992016-08-15 11:12:27 -0400301 always_inline u16x4
302u16x4_shift_left (u16x4 x, u16x4 i)
303{
304 return (u16x4) _m_psllw ((__m64) x, (__m64) i);
305};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700306
Dave Barachc3799992016-08-15 11:12:27 -0400307always_inline u32x2
308u32x2_shift_left (u32x2 x, u32x2 i)
309{
310 return (u32x2) _m_pslld ((__m64) x, (__m64) i);
311};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700312
Dave Barachc3799992016-08-15 11:12:27 -0400313always_inline u16x4
314u16x4_shift_right (u16x4 x, u16x4 i)
315{
316 return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
317};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700318
Dave Barachc3799992016-08-15 11:12:27 -0400319always_inline u32x2
320u32x2_shift_right (u32x2 x, u32x2 i)
321{
322 return (u32x2) _m_psrld ((__m64) x, (__m64) i);
323};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700324
Dave Barachc3799992016-08-15 11:12:27 -0400325always_inline i16x4
326i16x4_shift_left (i16x4 x, i16x4 i)
327{
328 return (i16x4) _m_psllw ((__m64) x, (__m64) i);
329};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700330
Dave Barachc3799992016-08-15 11:12:27 -0400331always_inline i32x2
332i32x2_shift_left (i32x2 x, i32x2 i)
333{
334 return (i32x2) _m_pslld ((__m64) x, (__m64) i);
335};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700336
Dave Barachc3799992016-08-15 11:12:27 -0400337always_inline i16x4
338i16x4_shift_right (i16x4 x, i16x4 i)
339{
340 return (i16x4) _m_psraw ((__m64) x, (__m64) i);
341};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700342
Dave Barachc3799992016-08-15 11:12:27 -0400343always_inline i32x2
344i32x2_shift_right (i32x2 x, i32x2 i)
345{
346 return (i32x2) _m_psrad ((__m64) x, (__m64) i);
347};
Ed Warnickecb9cada2015-12-08 15:45:58 -0700348
349#define u8x16_word_shift_left(a,n) (u8x16) _mm_slli_si128((__m128i) a, n)
350#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)
351
352#define i8x16_word_shift_left(a,n) \
353 ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
354#define i8x16_word_shift_right(a,n) \
355 ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))
356
357#define u16x8_word_shift_left(a,n) \
358 ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
359#define i16x8_word_shift_left(a,n) \
360 ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
361#define u16x8_word_shift_right(a,n) \
362 ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
363#define i16x8_word_shift_right(a,n) \
364 ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
365
366#define u32x4_word_shift_left(a,n) \
367 ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
368#define i32x4_word_shift_left(a,n) \
369 ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
370#define u32x4_word_shift_right(a,n) \
371 ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
372#define i32x4_word_shift_right(a,n) \
373 ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
374
375#define u64x2_word_shift_left(a,n) \
376 ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
377#define i64x2_word_shift_left(a,n) \
378 ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
379#define u64x2_word_shift_right(a,n) \
380 ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
381#define i64x2_word_shift_right(a,n) \
382 ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
383
384/* SSE2 has no rotate instructions: use shifts to simulate them. */
385#define _(t,n,lr1,lr2) \
386 always_inline t##x##n \
387 t##x##n##_irotate_##lr1 (t##x##n w, int i) \
388 { \
389 ASSERT (i >= 0 && i <= BITS (t)); \
390 return (t##x##n##_ishift_##lr1 (w, i) \
391 | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
392 } \
393 \
394 always_inline t##x##n \
395 t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
396 { \
397 t##x##n j = t##x##n##_splat (BITS (t)); \
398 return (t##x##n##_shift_##lr1 (w, i) \
399 | t##x##n##_shift_##lr2 (w, j - i)); \
400 }
401
Dave Barachc3799992016-08-15 11:12:27 -0400402_(u16, 8, left, right);
403_(u16, 8, right, left);
404_(u32, 4, left, right);
405_(u32, 4, right, left);
406_(u64, 2, left, right);
407_(u64, 2, right, left);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700408
409#undef _
410
411#ifndef __clang__
412#define _(t,n,lr1,lr2) \
413 always_inline t##x##n \
414 t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
415 { \
416 int m = sizeof (t##x##n) / sizeof (t); \
417 ASSERT (i >= 0 && i < m); \
418 return (t##x##n##_word_shift_##lr1 (w0, i) \
419 | t##x##n##_word_shift_##lr2 (w1, m - i)); \
420 } \
421 \
422 always_inline t##x##n \
423 t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
424 { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
425
Dave Barachc3799992016-08-15 11:12:27 -0400426_(u8, 16, left, right);
427_(u8, 16, right, left);
428_(u16, 8, left, right);
429_(u16, 8, right, left);
430_(u32, 4, left, right);
431_(u32, 4, right, left);
432_(u64, 2, left, right);
433_(u64, 2, right, left);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700434
435#undef _
436#endif
437
Ed Warnickecb9cada2015-12-08 15:45:58 -0700438#define u32x4_select(A,MASK) \
439({ \
440 u32x4 _x, _y; \
441 _x = (A); \
442 asm volatile ("pshufd %[mask], %[x], %[y]" \
443 : /* outputs */ [y] "=x" (_y) \
444 : /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
445 _y; \
446})
447
448#define u32x4_splat_word(x,i) \
449 u32x4_select ((x), (((i) << (2*0)) \
450 | ((i) << (2*1)) \
451 | ((i) << (2*2)) \
452 | ((i) << (2*3))))
453
454/* Extract low order 32 bit word. */
455always_inline u32
456u32x4_get0 (u32x4 x)
457{
458 u32 result;
Dave Barachc3799992016-08-15 11:12:27 -0400459 asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700460 : /* inputs */ [x] "x" (x));
461 return result;
462}
463
464always_inline u32x4
465u32x4_set0 (u32 x)
466{
467 u32x4 result;
Dave Barachc3799992016-08-15 11:12:27 -0400468 asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700469 : /* inputs */ [x] "r" (x));
470 return result;
471}
472
473always_inline i32x4
474i32x4_set0 (i32 x)
Dave Barachc3799992016-08-15 11:12:27 -0400475{
476 return (i32x4) u32x4_set0 ((u32) x);
477}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700478
479always_inline i32
480i32x4_get0 (i32x4 x)
Dave Barachc3799992016-08-15 11:12:27 -0400481{
482 return (i32) u32x4_get0 ((u32x4) x);
483}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700484
485/* Converts all ones/zeros compare mask to bitmap. */
Dave Barachc3799992016-08-15 11:12:27 -0400486always_inline u32
487u8x16_compare_byte_mask (u8x16 x)
488{
489 return _mm_movemask_epi8 ((__m128i) x);
490}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700491
Damjan Marionb8abf872016-03-14 20:02:35 +0100492extern u8 u32x4_compare_word_mask_table[256];
Ed Warnickecb9cada2015-12-08 15:45:58 -0700493
Dave Barachc3799992016-08-15 11:12:27 -0400494always_inline u32
495u32x4_compare_word_mask (u32x4 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700496{
497 u32 m = u8x16_compare_byte_mask ((u8x16) x);
498 return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
499 | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
500}
501
Dave Barachc3799992016-08-15 11:12:27 -0400502always_inline u32
503u8x16_zero_byte_mask (u8x16 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700504{
Dave Barachc3799992016-08-15 11:12:27 -0400505 u8x16 zero = { 0 };
Damjan Mariona52e1662018-05-19 00:04:23 +0200506 return u8x16_compare_byte_mask (x == zero);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700507}
508
Dave Barachc3799992016-08-15 11:12:27 -0400509always_inline u32
510u16x8_zero_byte_mask (u16x8 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700511{
Dave Barachc3799992016-08-15 11:12:27 -0400512 u16x8 zero = { 0 };
Damjan Mariona52e1662018-05-19 00:04:23 +0200513 return u8x16_compare_byte_mask ((u8x16) (x == zero));
Ed Warnickecb9cada2015-12-08 15:45:58 -0700514}
515
Dave Barachc3799992016-08-15 11:12:27 -0400516always_inline u32
517u32x4_zero_byte_mask (u32x4 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700518{
Dave Barachc3799992016-08-15 11:12:27 -0400519 u32x4 zero = { 0 };
Damjan Mariona52e1662018-05-19 00:04:23 +0200520 return u8x16_compare_byte_mask ((u8x16) (x == zero));
Ed Warnickecb9cada2015-12-08 15:45:58 -0700521}
522
Dave Barachc3799992016-08-15 11:12:27 -0400523always_inline u8x16
524u8x16_max (u8x16 x, u8x16 y)
525{
526 return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
527}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700528
Dave Barachc3799992016-08-15 11:12:27 -0400529always_inline u32
530u8x16_max_scalar (u8x16 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700531{
532 x = u8x16_max (x, u8x16_word_shift_right (x, 8));
533 x = u8x16_max (x, u8x16_word_shift_right (x, 4));
534 x = u8x16_max (x, u8x16_word_shift_right (x, 2));
535 x = u8x16_max (x, u8x16_word_shift_right (x, 1));
536 return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
537}
538
Dave Barachc3799992016-08-15 11:12:27 -0400539always_inline u8x16
540u8x16_min (u8x16 x, u8x16 y)
541{
542 return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
543}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700544
Dave Barachc3799992016-08-15 11:12:27 -0400545always_inline u8
546u8x16_min_scalar (u8x16 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700547{
548 x = u8x16_min (x, u8x16_word_shift_right (x, 8));
549 x = u8x16_min (x, u8x16_word_shift_right (x, 4));
550 x = u8x16_min (x, u8x16_word_shift_right (x, 2));
551 x = u8x16_min (x, u8x16_word_shift_right (x, 1));
552 return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
553}
554
Dave Barachc3799992016-08-15 11:12:27 -0400555always_inline i16x8
556i16x8_max (i16x8 x, i16x8 y)
557{
558 return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
559}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700560
Dave Barachc3799992016-08-15 11:12:27 -0400561always_inline i16
562i16x8_max_scalar (i16x8 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700563{
564 x = i16x8_max (x, i16x8_word_shift_right (x, 4));
565 x = i16x8_max (x, i16x8_word_shift_right (x, 2));
566 x = i16x8_max (x, i16x8_word_shift_right (x, 1));
567 return _mm_extract_epi16 ((__m128i) x, 0);
568}
569
Dave Barachc3799992016-08-15 11:12:27 -0400570always_inline i16x8
571i16x8_min (i16x8 x, i16x8 y)
572{
573 return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
574}
Ed Warnickecb9cada2015-12-08 15:45:58 -0700575
Dave Barachc3799992016-08-15 11:12:27 -0400576always_inline i16
577i16x8_min_scalar (i16x8 x)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700578{
579 x = i16x8_min (x, i16x8_word_shift_right (x, 4));
580 x = i16x8_min (x, i16x8_word_shift_right (x, 2));
581 x = i16x8_min (x, i16x8_word_shift_right (x, 1));
582 return _mm_extract_epi16 ((__m128i) x, 0);
583}
584
Damjan Marion8c3f8a22018-05-17 21:12:13 +0200585static_always_inline u16
586u8x16_msb_mask (u8x16 v)
587{
588 return _mm_movemask_epi8 ((__m128i) v);
589}
590
Damjan Marion398fdc12018-05-19 10:27:10 +0200591#define CLIB_HAVE_VEC128_MSB_MASK
592
Ed Warnickecb9cada2015-12-08 15:45:58 -0700593#undef _signed_binop
594
Damjan Marionbf129f42018-06-27 13:03:26 +0200595static_always_inline u16x8
596u16x8_byte_swap (u16x8 v)
597{
598 u8x16 swap = {
599 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
600 };
601 return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
602}
603
604static_always_inline u32x4
605u32x4_hadd (u32x4 v1, u32x4 v2)
606{
607 return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
608}
609
Damjan Marion48a87f22018-07-26 10:25:58 +0200610static_always_inline u8x16
611u8x16_shuffle (u8x16 v, u8x16 m)
612{
613 return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
614}
615
Damjan Marion5df580e2018-07-27 01:47:57 +0200616static_always_inline u32x4
617u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
618{
Damjan Marion612dd6a2018-07-30 12:45:07 +0200619#if defined(__clang__) || !__OPTIMIZE__
Damjan Marion5df580e2018-07-27 01:47:57 +0200620 u32x4 r = { v[a], v[b], v[c], v[d] };
621 return r;
622#else
623 return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
624 a | b << 2 | c << 4 | d << 6);
625#endif
626}
627
628/* _extend_to_ */
629/* *INDENT-OFF* */
630#define _(f,t,i) \
631static_always_inline t \
632f##_extend_to_##t (f x) \
633{ return (t) _mm_cvt##i ((__m128i) x); }
634
635_(u8x16, u16x8, epu8_epi16)
636_(u8x16, u32x4, epu8_epi32)
637_(u8x16, u64x2, epu8_epi64)
638_(u16x8, u32x4, epu16_epi32)
639_(u16x8, u64x2, epu16_epi64)
640_(u32x4, u64x2, epu32_epi64)
641
642_(i8x16, i16x8, epi8_epi16)
643_(i8x16, i32x4, epi8_epi32)
644_(i8x16, i64x2, epi8_epi64)
645_(i16x8, i32x4, epi16_epi32)
646_(i16x8, i64x2, epi16_epi64)
647_(i32x4, i64x2, epi32_epi64)
648#undef _
649/* *INDENT-ON* */
650
Damjan Marion07243572018-11-20 10:06:57 +0100651static_always_inline u64x2
652u64x2_gather (void *p0, void *p1)
653{
654 u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
655 return r;
656}
657
658static_always_inline u32x4
659u32x4_gather (void *p0, void *p1, void *p2, void *p3, void *p4)
660{
661 u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
662 return r;
663}
664
665
666static_always_inline void
667u64x2_scatter (u64x2 r, void *p0, void *p1)
668{
669 *(u64 *) p0 = r[0];
670 *(u64 *) p1 = r[1];
671}
672
673static_always_inline void
674u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
675{
676 *(u32 *) p0 = r[0];
677 *(u32 *) p1 = r[1];
678 *(u32 *) p2 = r[2];
679 *(u32 *) p3 = r[3];
680}
681
682static_always_inline void
683u64x2_scatter_one (u64x2 r, int index, void *p)
684{
685 *(u64 *) p = r[index];
686}
687
688static_always_inline void
689u32x4_scatter_one (u32x4 r, int index, void *p)
690{
691 *(u32 *) p = r[index];
692}
693
Damjan Marion5df580e2018-07-27 01:47:57 +0200694
Ed Warnickecb9cada2015-12-08 15:45:58 -0700695#endif /* included_vector_sse2_h */
Dave Barachc3799992016-08-15 11:12:27 -0400696
697/*
698 * fd.io coding-style-patch-verification: ON
699 *
700 * Local Variables:
701 * eval: (c-set-style "gnu")
702 * End:
703 */