/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>   /* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}

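/* Illustration only (not part of the API): _lo interleaves the low halves
   of the two operands, _hi the high halves.  With a = {a0, ..., a15} and
   b = {b0, ..., b15}:

     u8x16_interleave_lo (a, b) => {a0, b0, a1, b1, ..., a7, b7}
     u8x16_interleave_hi (a, b) => {a8, b8, a9, b9, ..., a15, b15}
*/
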
/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

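/* Illustration only: the pack operations narrow each element with
   saturation.  u16x8_pack uses _mm_packus_epi16, which treats each lane
   as a signed 16 bit value and clamps it to [0, 255], e.g.
   {300, 5, ...} -> {255, 5, ...}; i16x8_pack and u32x4_pack use signed
   saturation (_mm_packs_epi16 / _mm_packs_epi32). */
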
/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

/* Splats: replicate scalar value into vector. */
always_inline u64x2
u64x2_splat (u64 a)
{
  u64x2 x = { a, a };
  return x;
}

always_inline u32x4
u32x4_splat (u32 a)
{
  u32x4 x = { a, a, a, a };
  return x;
}

always_inline u16x8
u16x8_splat (u16 a)
{
  u16x8 x = { a, a, a, a, a, a, a, a };
  return x;
}

always_inline u8x16
u8x16_splat (u8 a)
{
  u8x16 x = { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a };
  return x;
}

always_inline u32x2
u32x2_splat (u32 a)
{
  u32x2 x = { a, a };
  return x;
}

always_inline u16x4
u16x4_splat (u16 a)
{
  u16x4 x = { a, a, a, a };
  return x;
}

always_inline u8x8
u8x8_splat (u8 a)
{
  u8x8 x = { a, a, a, a, a, a, a, a };
  return x;
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat

#ifndef __ICC
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
#endif

/* Unaligned loads/stores. */

#define _(t)						\
  always_inline void t##_store_unaligned (t x, t * a)	\
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }	\
  always_inline t t##_load_unaligned (t * a)		\
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
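
/* Illustration only: the macro above generates, e.g.,
   u8x16_load_unaligned (u8x16 *) and u8x16_store_unaligned (u8x16, u8x16 *),
   and likewise for the other listed types.  With a hypothetical pointer p:

     u32x4 v = u32x4_load_unaligned ((u32x4 *) p);
     u32x4_store_unaligned (v, (u32x4 *) p);
*/
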
#define _signed_binop(n,m,f,g)						\
  /* Unsigned */							\
  always_inline u##n##x##m						\
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)				\
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }	\
									\
  /* Signed */								\
  always_inline i##n##x##m						\
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)				\
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)

/* Addition/subtraction with saturation. */
_signed_binop (8, 16, add_saturate, adds_epu)
_signed_binop (16, 8, add_saturate, adds_epu)
_signed_binop (8, 16, sub_saturate, subs_epu)
_signed_binop (16, 8, sub_saturate, subs_epu)
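
/* Illustration only: the expansions above define, e.g., u8x16_add,
   i8x16_add, u32x4_sub, i64x2_sub, u8x16_add_saturate and
   u16x8_sub_saturate.  With a and b previously defined:

     u32x4 sum = u32x4_add (a, b);             lane-wise, wrap-around
     u8x16 s   = u8x16_add_saturate (x, y);    each lane clamps at 255

   Note that saturating variants exist only for 8 and 16 bit lanes, and
   both the u and i saturating names expand to the unsigned intrinsics
   (_mm_adds_epu8, etc.) as written here. */
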
/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

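/* Illustration only: mul_lo keeps the low 16 bits of each 16x16 -> 32 bit
   product, mul_hi the high 16 bits; on little-endian x86 interleaving the
   two rebuilds the full products:

     u16x8 lo = u16x8_mul_lo (x, y);
     u16x8 hi = u16x8_mul_hi (x, y);
     u32x4 p0 = (u32x4) u16x8_interleave_lo (lo, hi);   products 0..3
     u32x4 p1 = (u32x4) u16x8_interleave_hi (lo, hi);   products 4..7
*/
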
/* 128 bit shifts. */

#define _(p,a,b,c,f)							\
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }		\
									\
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
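
/* Illustration only: each expansion defines two forms, e.g. for u32x4:

     u32x4_ishift_left (x, 3)    shift count is an int
     u32x4_shift_left (x, n)     shift count taken from the low 64 bits of n

   All lanes are shifted by the same count.  Note there is no i64x2 right
   shift: SSE2 has no 64 bit arithmetic shift instruction. */
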
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}

#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

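/* Illustration only: the word shifts move whole elements (not bits) and
   shift zeros in.  With x = {x0, x1, x2, x3}:

     u32x4_word_shift_right (x, 1) => {x1, x2, x3, 0}
     u32x4_word_shift_left (x, 1)  => {0, x0, x1, x2}
*/
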
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_irotate_##lr1 (t##x##n w, int i)			\
  {								\
    ASSERT (i >= 0 && i <= BITS (t));				\
    return (t##x##n##_ishift_##lr1 (w, i)			\
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));	\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)			\
  {								\
    t##x##n j = t##x##n##_splat (BITS (t));			\
    return (t##x##n##_shift_##lr1 (w, i)			\
	    | t##x##n##_shift_##lr2 (w, j - i));		\
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _

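/* Illustration only: these compose a rotate from two opposing shifts,
   e.g. u32x4_irotate_left (x, 8) == (x << 8) | (x >> 24) per lane.
   The count must satisfy 0 <= i <= BITS (t), as asserted above. */
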
#ifndef __clang__
#define _(t,n,lr1,lr2)						\
  always_inline t##x##n						\
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)	\
  {								\
    int m = sizeof (t##x##n) / sizeof (t);			\
    ASSERT (i >= 0 && i < m);					\
    return (t##x##n##_word_shift_##lr1 (w0, i)			\
	    | t##x##n##_word_shift_##lr2 (w1, m - i));		\
  }								\
								\
  always_inline t##x##n						\
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)		\
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

/* Compare operations. */
always_inline u8x16
u8x16_is_equal (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline i8x16
i8x16_is_equal (i8x16 x, i8x16 y)
{
  return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_is_equal (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_is_equal (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
u32x4_is_equal (u32x4 x, u32x4 y)
{
  return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline i32x4
i32x4_is_equal (i32x4 x, i32x4 y)
{
  return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{
  return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{
  return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
u8x16_is_zero (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_is_equal (x, zero);
}

always_inline u16x8
u16x8_is_zero (u16x8 x)
{
  u16x8 zero = { 0 };
  return u16x8_is_equal (x, zero);
}

always_inline u32x4
u32x4_is_zero (u32x4 x)
{
  u32x4 zero = { 0 };
  return u32x4_is_equal (x, zero);
}

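/* Illustration only: the compare operations return an all-ones mask
   (~0, not 1) in every lane where the predicate holds and all zeros
   elsewhere, e.g.

     u32x4_is_equal ({1, 2, 3, 4}, {1, 0, 3, 0}) => {~0, 0, ~0, 0}

   which is the form u8x16_compare_byte_mask () below expects. */
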
#define u32x4_select(A,MASK)						\
({									\
  u32x4 _x, _y;								\
  _x = (A);								\
  asm volatile ("pshufd %[mask], %[x], %[y]"				\
		: /* outputs */ [y] "=x" (_y)				\
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK));	\
  _y;									\
})

#define u32x4_splat_word(x,i)			\
  u32x4_select ((x), (((i) << (2*0))		\
		      | ((i) << (2*1))		\
		      | ((i) << (2*2))		\
		      | ((i) << (2*3))))

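/* Illustration only: MASK is the pshufd immediate, four 2 bit source lane
   indices packed low to high.  For example u32x4_splat_word (x, 2) builds
   the immediate 0xaa (index 2 in all four fields) and yields
   {x[2], x[2], x[2], x[2]}. */
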
/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

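/* Illustration only: bit i of the result is the top bit of byte lane i,
   so for a compare result m, (m & (1 << i)) != 0 means byte lane i
   matched, and m == 0xffff means all 16 lanes matched. */
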
extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}

always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

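/* Illustration only: the scalar reductions here and below fold the vector
   in half with element shifts; after log2 (n_lanes) max/min steps lane 0
   holds the reduction over all lanes, which _mm_extract_epi16 then reads. */
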
always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

#undef _signed_binop

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */