/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
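/* The _lo variants interleave corresponding elements from the low halves
   of a and b (a0, b0, a1, b1, ...); the _hi variants do the same for the
   high halves. */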
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}

/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

/* 128 bit packs. */
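/* Packs narrow each element to half its width with saturation; the first
   argument supplies the low half of the result vector. */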
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

#ifndef __ICC
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
#endif

/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
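/* Example usage (p is a hypothetical pointer that need not be 16-byte
   aligned):
     u32x4 v = u32x4_load_unaligned ((u32x4 *) p);
     u32x4_store_unaligned (v, (u32x4 *) p);  */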
#define _signed_binop(n,m,f,g)						\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)

/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}
/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
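/* The generated _ishift_ variants take an immediate bit count; the
   _shift_ variants take the count in the low 64 bits of a vector operand,
   matching the underlying psll/psrl/psra instructions. */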
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}
#define u8x16_word_shift_left(a,n) (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
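/* "Word" shifts move whole elements rather than bits: the byte count given
   to _mm_slli_si128/_mm_srli_si128 is scaled by the element size, and
   vacated lanes are zero filled. */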
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
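/* Example: u32x4_irotate_left (x, 8) rotates each 32-bit lane left by
   8 bits. */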
#ifndef __clang__
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

always_inline int
u8x16_is_all_zero (u8x16 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

always_inline int
u16x8_is_all_zero (u16x8 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

always_inline int
u32x4_is_all_zero (u32x4 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

always_inline int
u64x2_is_all_zero (u64x2 x)
{
  return _mm_testz_si128 ((__m128i) x, (__m128i) x);
}

#define u32x4_select(A,MASK)						\
({									\
  u32x4 _x, _y;								\
  _x = (A);								\
  asm volatile ("pshufd %[mask], %[x], %[y]"				\
		: /* outputs */ [y] "=x" (_y)				\
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK));	\
  _y;									\
})

#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << (2*0))		\
		      | ((i) << (2*1))		\
		      | ((i) << (2*2))		\
		      | ((i) << (2*3))))

/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}

always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}
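/* The *_scalar reductions below fold the vector onto itself with halving
   word shifts so the extreme value collects in lane 0, which is then
   extracted. */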
always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

#undef _signed_binop

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */