/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
  Copyright (c) 2005 Eliot Dresselhaus

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef included_vector_sse2_h
#define included_vector_sse2_h

#include <vppinfra/error_bootstrap.h>	/* for ASSERT */
#include <x86intrin.h>

/* 128 bit interleaves. */
always_inline u8x16
u8x16_interleave_hi (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u8x16
u8x16_interleave_lo (u8x16 a, u8x16 b)
{
  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_hi (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u16x8
u16x8_interleave_lo (u16x8 a, u16x8 b)
{
  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_hi (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u32x4
u32x4_interleave_lo (u32x4 a, u32x4 b)
{
  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_hi (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
}

always_inline u64x2
u64x2_interleave_lo (u64x2 a, u64x2 b)
{
  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
}

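/* Usage sketch (illustrative, not part of the API): interleave_lo
   merges the low halves of its two arguments lane by lane, so

     u8x16 a = u8x16_splat (0xaa), b = u8x16_splat (0xbb);
     u8x16 m = u8x16_interleave_lo (a, b);   // {0xaa,0xbb,0xaa,0xbb,...}

   interleave_hi does the same with the high halves. */
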
/* 64 bit interleaves. */
always_inline u8x8
u8x8_interleave_hi (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
}

always_inline u8x8
u8x8_interleave_lo (u8x8 a, u8x8 b)
{
  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_hi (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
}

always_inline u16x4
u16x4_interleave_lo (u16x4 a, u16x4 b)
{
  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_hi (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
}

always_inline u32x2
u32x2_interleave_lo (u32x2 a, u32x2 b)
{
  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
}

/* 128 bit packs. */
always_inline u8x16
u16x8_pack (u16x8 lo, u16x8 hi)
{
  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline i8x16
i16x8_pack (i16x8 lo, i16x8 hi)
{
  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
}

always_inline u16x8
u32x4_pack (u32x4 lo, u32x4 hi)
{
  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
}

/* 64 bit packs. */
always_inline u8x8
u16x4_pack (u16x4 lo, u16x4 hi)
{
  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
}

always_inline i8x8
i16x4_pack (i16x4 lo, i16x4 hi)
{
  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
}

always_inline u16x4
u32x2_pack (u32x2 lo, u32x2 hi)
{
  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

always_inline i16x4
i32x2_pack (i32x2 lo, i32x2 hi)
{
  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
}

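/* Usage sketch (illustrative): packing narrows two vectors into one,
   saturating each lane, e.g.

     u16x8 lo = u16x8_splat (300), hi = u16x8_splat (7);
     u8x16 p = u16x8_pack (lo, hi);   // {255 x8, 7 x8}: 300 saturates

   Note u16x8_pack uses unsigned saturation (packus_epi16) while the
   wider u32x4_pack uses the signed packs_epi32, since SSE2 has no
   unsigned 32 -> 16 pack instruction. */
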
/* Splats: replicate scalar value into vector. */
always_inline u64x2
u64x2_splat (u64 a)
{
  u64x2 x = { a };
  x = u64x2_interleave_lo (x, x);
  return x;
}

always_inline u32x4
u32x4_splat (u32 a)
{
  u32x4 x = { a };
  x = u32x4_interleave_lo (x, x);
  x = (u32x4) u64x2_interleave_lo ((u64x2) x, (u64x2) x);
  return x;
}

always_inline u16x8
u16x8_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x8) u32x4_splat (t);
}

always_inline u8x16
u8x16_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x16) u16x8_splat (t);
}

always_inline u32x2
u32x2_splat (u32 a)
{
  u32x2 x = { a };
  x = u32x2_interleave_lo (x, x);
  return x;
}

always_inline u16x4
u16x4_splat (u16 a)
{
  u32 t = (u32) a | ((u32) a << 16);
  return (u16x4) u32x2_splat (t);
}

always_inline u8x8
u8x8_splat (u8 a)
{
  u32 t = (u32) a | ((u32) a << 8);
  t |= t << 16;
  return (u8x8) u32x2_splat (t);
}

#define i64x2_splat u64x2_splat
#define i32x4_splat u32x4_splat
#define i16x8_splat u16x8_splat
#define i8x16_splat u8x16_splat
#define i32x2_splat u32x2_splat
#define i16x4_splat u16x4_splat
#define i8x8_splat u8x8_splat

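/* Usage sketch (illustrative):

     u32x4 v = u32x4_splat (0x01020304);  // all four lanes = 0x01020304

   The narrower splats first replicate the scalar within a 32 bit word,
   then splat that word, avoiding a per-lane insert sequence. */
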
#ifndef __ICC
always_inline u64x2
u64x2_read_lo (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
}

always_inline u64x2
u64x2_read_hi (u64x2 x, u64 * a)
{
  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
}

always_inline void
u64x2_write_lo (u64x2 x, u64 * a)
{
  _mm_storel_pi ((__m64 *) a, (__m128) x);
}

always_inline void
u64x2_write_hi (u64x2 x, u64 * a)
{
  _mm_storeh_pi ((__m64 *) a, (__m128) x);
}
#endif

/* Unaligned loads/stores. */

#define _(t)                                            \
  always_inline void t##_store_unaligned (t x, t * a)   \
  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); }    \
  always_inline t t##_load_unaligned (t * a)            \
  { return (t) _mm_loadu_si128 ((__m128i *) a); }

_(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
#undef _
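
/* Usage sketch (illustrative): the macro above expands to a load/store
   pair per vector type, e.g. for u32x4:

     u32 buf[4] = { 1, 2, 3, 4 };        // no 16-byte alignment needed
     u32x4 v = u32x4_load_unaligned ((u32x4 *) buf);
     u32x4_store_unaligned (v, (u32x4 *) buf);
*/
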
#define _signed_binop(n,m,f,g)                                          \
  /* Unsigned */                                                        \
  always_inline u##n##x##m                                              \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)                         \
  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }        \
                                                                        \
  /* Signed */                                                          \
  always_inline i##n##x##m                                              \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)                         \
  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }

/* Addition/subtraction. */
_signed_binop (8, 16, add, add_epi)
_signed_binop (16, 8, add, add_epi)
_signed_binop (32, 4, add, add_epi)
_signed_binop (64, 2, add, add_epi)
_signed_binop (8, 16, sub, sub_epi)
_signed_binop (16, 8, sub, sub_epi)
_signed_binop (32, 4, sub, sub_epi)
_signed_binop (64, 2, sub, sub_epi)

/* Addition/subtraction with saturation.  Unlike wrapping add/sub,
   signed and unsigned lanes saturate differently, so this macro takes
   separate unsigned (gu) and signed (gs) intrinsic roots rather than
   reusing _signed_binop, which would wrongly apply the unsigned
   intrinsic to the signed variant. */
#define _saturating_binop(n,m,f,gu,gs)                                  \
  always_inline u##n##x##m                                              \
  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y)                         \
  { return (u##n##x##m) _mm_##gu##n ((__m128i) x, (__m128i) y); }       \
                                                                        \
  always_inline i##n##x##m                                              \
  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y)                         \
  { return (i##n##x##m) _mm_##gs##n ((__m128i) x, (__m128i) y); }

_saturating_binop (8, 16, add_saturate, adds_epu, adds_epi)
_saturating_binop (16, 8, add_saturate, adds_epu, adds_epi)
_saturating_binop (8, 16, sub_saturate, subs_epu, subs_epi)
_saturating_binop (16, 8, sub_saturate, subs_epu, subs_epi)
#undef _saturating_binop

/* Multiplication. */
always_inline i16x8
i16x8_mul_lo (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_lo (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_mul_hi (i16x8 x, i16x8 y)
{
  /* Signed high half must use mulhi_epi16, not the unsigned epu16. */
  return (i16x8) _mm_mulhi_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_mul_hi (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
}

/* 128 bit shifts. */

#define _(p,a,b,c,f)                                                     \
  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); }            \
                                                                         \
  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }

_(u, 16, 8, left, sll)
_(u, 32, 4, left, sll)
_(u, 64, 2, left, sll)
_(u, 16, 8, right, srl)
_(u, 32, 4, right, srl)
_(u, 64, 2, right, srl)
_(i, 16, 8, left, sll)
_(i, 32, 4, left, sll)
_(i, 64, 2, left, sll)
_(i, 16, 8, right, sra)
_(i, 32, 4, right, sra)
#undef _
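
/* Usage sketch (illustrative): _ishift_ takes an immediate count,
   _shift_ a per-call vector count (all lanes shift by the count held
   in the low 64 bits of y), e.g.

     u32x4 v = u32x4_splat (4);
     v = u32x4_ishift_left (v, 2);   // every lane becomes 16

   There is no i64x2 arithmetic right shift above: SSE2 lacks psraq. */
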
/* 64 bit shifts. */
always_inline u16x4
u16x4_shift_left (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_left (u32x2 x, u32x2 i)
{
  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline u16x4
u16x4_shift_right (u16x4 x, u16x4 i)
{
  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
}

always_inline u32x2
u32x2_shift_right (u32x2 x, u32x2 i)
{
  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_left (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_left (i32x2 x, i32x2 i)
{
  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
}

always_inline i16x4
i16x4_shift_right (i16x4 x, i16x4 i)
{
  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
}

always_inline i32x2
i32x2_shift_right (i32x2 x, i32x2 i)
{
  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
}

#define u8x16_word_shift_left(a,n)  (u8x16) _mm_slli_si128((__m128i) a, n)
#define u8x16_word_shift_right(a,n) (u8x16) _mm_srli_si128((__m128i) a, n)

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))

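/* Usage sketch (illustrative): these shift whole lanes, not bits.
   u8x16_word_shift_right (x, 8) moves the upper 8 bytes of x into the
   lower 8 byte positions and zero-fills the rest, which is what the
   scalar min/max reductions further below rely on. */
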
/* SSE2 has no rotate instructions: use shifts to simulate them. */
#define _(t,n,lr1,lr2)                                          \
  always_inline t##x##n                                         \
  t##x##n##_irotate_##lr1 (t##x##n w, int i)                    \
  {                                                             \
    ASSERT (i >= 0 && i <= BITS (t));                           \
    return (t##x##n##_ishift_##lr1 (w, i)                       \
	    | t##x##n##_ishift_##lr2 (w, BITS (t) - i));        \
  }                                                             \
                                                                \
  always_inline t##x##n                                         \
  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i)                 \
  {                                                             \
    t##x##n j = t##x##n##_splat (BITS (t));                     \
    return (t##x##n##_shift_##lr1 (w, i)                        \
	    | t##x##n##_shift_##lr2 (w, j - i));                \
  }

_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _

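/* Usage sketch (illustrative): a rotate is composed from the two
   opposing shifts, e.g. rotating each 32 bit lane left by 8:

     u32x4 v = u32x4_splat (0x11223344);
     v = u32x4_irotate_left (v, 8);   // each lane -> 0x22334411

   This relies on SSE shifts producing zero when the count equals the
   lane width, so the i == 0 and i == BITS (t) edge cases still work. */
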
#ifndef __clang__
#define _(t,n,lr1,lr2)                                          \
  always_inline t##x##n                                         \
  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i)  \
  {                                                             \
    int m = sizeof (t##x##n) / sizeof (t);                      \
    ASSERT (i >= 0 && i < m);                                   \
    return (t##x##n##_word_shift_##lr1 (w0, i)                  \
	    | t##x##n##_word_shift_##lr2 (w1, m - i));          \
  }                                                             \
                                                                \
  always_inline t##x##n                                         \
  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i)               \
  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }

_(u8, 16, left, right);
_(u8, 16, right, left);
_(u16, 8, left, right);
_(u16, 8, right, left);
_(u32, 4, left, right);
_(u32, 4, right, left);
_(u64, 2, left, right);
_(u64, 2, right, left);

#undef _
#endif

/* Compare operations. */
always_inline u8x16
u8x16_is_equal (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline i8x16
i8x16_is_equal (i8x16 x, i8x16 y)
{
  return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
u16x8_is_equal (u16x8 x, u16x8 y)
{
  return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16x8
i16x8_is_equal (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
u32x4_is_equal (u32x4 x, u32x4 y)
{
  return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline i32x4
i32x4_is_equal (i32x4 x, i32x4 y)
{
  return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
i8x16_is_greater (i8x16 x, i8x16 y)
{
  return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y);
}

always_inline u16x8
i16x8_is_greater (i16x8 x, i16x8 y)
{
  return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y);
}

always_inline u32x4
i32x4_is_greater (i32x4 x, i32x4 y)
{
  return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y);
}

always_inline u8x16
u8x16_is_zero (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_is_equal (x, zero);
}

always_inline u16x8
u16x8_is_zero (u16x8 x)
{
  u16x8 zero = { 0 };
  return u16x8_is_equal (x, zero);
}

always_inline u32x4
u32x4_is_zero (u32x4 x)
{
  u32x4 zero = { 0 };
  return u32x4_is_equal (x, zero);
}

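/* Usage sketch (illustrative): compares return a per-lane mask of all
   ones (match) or all zeros, not a boolean, e.g.

     u32x4 m = u32x4_is_equal (a, b);  // lane = 0xffffffff where a == b

   which is why the result can be fed to u8x16_compare_byte_mask below. */
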
#define u32x4_select(A,MASK)                                    \
({                                                              \
  u32x4 _x, _y;                                                 \
  _x = (A);                                                     \
  asm volatile ("pshufd %[mask], %[x], %[y]"                    \
		: /* outputs */ [y] "=x" (_y)                   \
		: /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
  _y;                                                           \
})

#define u32x4_splat_word(x,i)                   \
  u32x4_select ((x), (((i) << (2*0))            \
		      | ((i) << (2*1))          \
		      | ((i) << (2*2))          \
		      | ((i) << (2*3))))

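/* Usage sketch (illustrative): MASK packs four 2 bit source-lane
   indices, pshufd style, so

     u32x4 r = u32x4_select (v, 0x1b);  // 0b00011011: reverse the lanes

   and u32x4_splat_word (v, i) replicates lane i into all four lanes. */
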
/* Extract low order 32 bit word. */
always_inline u32
u32x4_get0 (u32x4 x)
{
  u32 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=r" (result)
		: /* inputs */ [x] "x" (x));
  return result;
}

always_inline u32x4
u32x4_set0 (u32 x)
{
  u32x4 result;
  asm volatile ("movd %[x], %[result]"
		: /* outputs */ [result] "=x" (result)
		: /* inputs */ [x] "r" (x));
  return result;
}

always_inline i32x4
i32x4_set0 (i32 x)
{
  return (i32x4) u32x4_set0 ((u32) x);
}

always_inline i32
i32x4_get0 (i32x4 x)
{
  return (i32) u32x4_get0 ((u32x4) x);
}

/* Converts all ones/zeros compare mask to bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  return _mm_movemask_epi8 ((__m128i) x);
}

extern u8 u32x4_compare_word_mask_table[256];

always_inline u32
u32x4_compare_word_mask (u32x4 x)
{
  u32 m = u8x16_compare_byte_mask ((u8x16) x);
  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
	  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
}

always_inline u32
u8x16_zero_byte_mask (u8x16 x)
{
  u8x16 zero = { 0 };
  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
}

always_inline u32
u16x8_zero_byte_mask (u16x8 x)
{
  u16x8 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
}

always_inline u32
u32x4_zero_byte_mask (u32x4 x)
{
  u32x4 zero = { 0 };
  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
}

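/* Usage sketch (illustrative): movemask collects the top bit of each
   byte lane, so a compare result becomes a 16 bit scalar bitmap.
   With `line` standing for any caller-supplied u8x16:

     u8x16 needle = u8x16_splat (':');
     u32 hits = u8x16_compare_byte_mask (u8x16_is_equal (line, needle));

   Bit i of hits is set exactly when byte i of line is ':'.  */
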
always_inline u8x16
u8x16_max (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u32
u8x16_max_scalar (u8x16 x)
{
  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline u8x16
u8x16_min (u8x16 x, u8x16 y)
{
  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
}

always_inline u8
u8x16_min_scalar (u8x16 x)
{
  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
}

always_inline i16x8
i16x8_max (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_max_scalar (i16x8 x)
{
  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

always_inline i16x8
i16x8_min (i16x8 x, i16x8 y)
{
  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
}

always_inline i16
i16x8_min_scalar (i16x8 x)
{
  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
  return _mm_extract_epi16 ((__m128i) x, 0);
}

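/* The scalar reductions above halve the problem each step: shifting
   the vector right by 8, 4, 2, then 1 lanes and folding with max/min
   leaves the overall result in lane 0 after log2(n) operations. */
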
#undef _signed_binop

#endif /* included_vector_sse2_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */