Blame - src/vppinfra/vector_avx2.h - fdio/vpp

blob: 584bd207b279bbdc606f1620c60b6e5736f611f3 [file] [log] [blame]

Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	1	/*
				2	* Copyright (c) 2018 Cisco and/or its affiliates.
				3	* Licensed under the Apache License, Version 2.0 (the "License");
				4	* you may not use this file except in compliance with the License.
				5	* You may obtain a copy of the License at:
				6	*
				7	* http://www.apache.org/licenses/LICENSE-2.0
				8	*
				9	* Unless required by applicable law or agreed to in writing, software
				10	* distributed under the License is distributed on an "AS IS" BASIS,
				11	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	* See the License for the specific language governing permissions and
				13	* limitations under the License.
				14	*/
				15
				16	#ifndef included_vector_avx2_h
				17	#define included_vector_avx2_h
				18
				19	#include <vppinfra/clib.h>
				20	#include <x86intrin.h>
				21
Damjan Marion	a52e166	2018-05-19 00:04:23 +0200	[diff] [blame]	22	/* INDENT-OFF */
/* X-macro lists of the 256-bit vector types covered by this header.
   Every entry is _(type prefix, element bits, element count, intrinsic
   suffix); the user defines _ before expanding one of the lists.  */
#define foreach_avx2_vec256i \
  _ (i, 8, 32, epi8) \
  _ (i, 16, 16, epi16) \
  _ (i, 32, 8, epi32) \
  _ (i, 64, 4, epi64)

#define foreach_avx2_vec256u \
  _ (u, 8, 32, epi8) \
  _ (u, 16, 16, epi16) \
  _ (u, 32, 8, epi32) \
  _ (u, 64, 4, epi64)

#define foreach_avx2_vec256f \
  _ (f, 32, 8, ps) \
  _ (f, 64, 4, pd)

/* The helper template pastes "_mm256_set1_" with the intrinsic suffix,
   which yields _mm256_set1_epi64 for 64-bit elements; alias that name to
   the _mm256_set1_epi64x intrinsic.  */
#define _mm256_set1_epi64 _mm256_set1_epi64x
				31
Damjan Marion	a52e166	2018-05-19 00:04:23 +0200	[diff] [blame]	32	/* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
				33	is_all_equal */
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	34	#define _(t, s, c, i) \
				35	static_always_inline t##s##x##c \
				36	t##s##x##c##_splat (t##s x) \
				37	{ return (t##s##x##c) _mm256_set1_##i (x); } \
				38	\
				39	static_always_inline t##s##x##c \
				40	t##s##x##c##_load_unaligned (void *p) \
				41	{ return (t##s##x##c) _mm256_loadu_si256 (p); } \
				42	\
				43	static_always_inline void \
				44	t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
				45	{ _mm256_storeu_si256 ((__m256i *) p, (__m256i) v); } \
				46	\
				47	static_always_inline int \
				48	t##s##x##c##_is_all_zero (t##s##x##c x) \
				49	{ return _mm256_testz_si256 ((__m256i) x, (__m256i) x); } \
				50	\
				51	static_always_inline int \
Damjan Marion	1486477	2018-05-22 14:07:47 +0200	[diff] [blame]	52	t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
				53	{ return t##s##x##c##_is_all_zero (a ^ b); } \
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	54	\
Damjan Marion	a52e166	2018-05-19 00:04:23 +0200	[diff] [blame]	55	static_always_inline int \
				56	t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
Damjan Marion	4fce7f7	2018-07-16 14:18:23 +0200	[diff] [blame]	57	{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
				58	\
				59	static_always_inline t##s##x##c \
				60	t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
				61	{ return (t##s##x##c) _mm256_unpacklo_##i ((__m256i) a, (__m256i) b); } \
				62	\
				63	static_always_inline t##s##x##c \
				64	t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
				65	{ return (t##s##x##c) _mm256_unpackhi_##i ((__m256i) a, (__m256i) b); } \
				66
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	67
				68	foreach_avx2_vec256i foreach_avx2_vec256u
				69	#undef _
Damjan Marion	a52e166	2018-05-19 00:04:23 +0200	[diff] [blame]	70	/* INDENT-ON */
				71
				72	always_inline u32x8
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	73	u32x8_permute (u32x8 v, u32x8 idx)
				74	{
				75	return (u32x8) _mm256_permutevar8x32_epi32 ((__m256i) v, (__m256i) idx);
				76	}
				77
/* Permute the four 64-bit elements of v: result element n is taken from
   v[m<n>].  m0..m3 are 2-bit source indices (0..3) and must be
   compile-time constants because they are packed into the intrinsic's
   immediate operand.  Arguments and the whole expansion are
   parenthesized so the macro is safe inside larger expressions.  */
#define u64x4_permute(v, m0, m1, m2, m3) \
  ((u64x4) _mm256_permute4x64_epi64 ( \
    (__m256i) (v), ((m0) | (m1) << 2 | (m2) << 4 | (m3) << 6)))
				81
Damjan Marion	1cf9a16	2018-05-23 20:21:51 +0200	[diff] [blame]	82	/* _extract_lo, _extract_hi */
				83	/* INDENT-OFF */
				84	#define _(t1,t2) \
				85	always_inline t1 \
				86	t2##_extract_lo (t2 v) \
				87	{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 0); } \
				88	\
				89	always_inline t1 \
				90	t2##_extract_hi (t2 v) \
				91	{ return (t1) _mm256_extracti128_si256 ((__m256i) v, 1); } \
				92	\
				93	always_inline t2 \
				94	t2##_insert_lo (t2 v1, t1 v2) \
				95	{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 0); }\
				96	\
				97	always_inline t2 \
				98	t2##_insert_hi (t2 v1, t1 v2) \
				99	{ return (t2) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); }\
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	100
Damjan Marion	1cf9a16	2018-05-23 20:21:51 +0200	[diff] [blame]	101	_(u8x16, u8x32)
				102	_(u16x8, u16x16)
				103	_(u32x4, u32x8)
				104	_(u64x2, u64x4)
				105	#undef _
				106	/* INDENT-ON */
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	107
Damjan Marion	ef0bac7	2021-04-22 18:08:28 +0200	[diff] [blame^]	108	always_inline u8x32
				109	u16x16_pack (u16x16 lo, u16x16 hi)
				110	{
				111	return (u8x32) _mm256_packus_epi16 ((__m256i) lo, (__m256i) hi);
				112	}
Damjan Marion	ee7f0bd	2018-05-05 12:30:28 +0200	[diff] [blame]	113
Damjan Marion	ef0bac7	2021-04-22 18:08:28 +0200	[diff] [blame^]	114	always_inline i8x32
				115	i16x16_pack (i16x16 lo, i16x16 hi)
				116	{
				117	return (i8x32) _mm256_packs_epi16 ((__m256i) lo, (__m256i) hi);
				118	}
Damjan Marion	ee7f0bd	2018-05-05 12:30:28 +0200	[diff] [blame]	119
Damjan Marion	8c3f8a2	2018-05-17 21:12:13 +0200	[diff] [blame]	120	static_always_inline u32
				121	u8x32_msb_mask (u8x32 v)
				122	{
				123	return _mm256_movemask_epi8 ((__m256i) v);
				124	}
				125
Damjan Marion	ef0bac7	2021-04-22 18:08:28 +0200	[diff] [blame^]	126	static_always_inline u32
				127	i8x32_msb_mask (i8x32 v)
				128	{
				129	return _mm256_movemask_epi8 ((__m256i) v);
				130	}
				131
Damjan Marion	90d05bc	2020-08-31 17:18:26 +0200	[diff] [blame]	132	/* _from_ */
Damjan Marion	afe56de	2018-05-17 12:44:00 +0200	[diff] [blame]	133	/* INDENT-OFF */
				134	#define _(f,t,i) \
				135	static_always_inline t \
Damjan Marion	90d05bc	2020-08-31 17:18:26 +0200	[diff] [blame]	136	t##_from_##f (f x) \
Damjan Marion	afe56de	2018-05-17 12:44:00 +0200	[diff] [blame]	137	{ return (t) _mm256_cvt##i ((__m128i) x); }
				138
				139	_(u16x8, u32x8, epu16_epi32)
				140	_(u16x8, u64x4, epu16_epi64)
				141	_(u32x4, u64x4, epu32_epi64)
Lijian.Zhang	7e9d5ff	2021-04-14 16:12:28 +0800	[diff] [blame]	142	_ (u8x16, u16x16, epu8_epi16)
Damjan Marion	afe56de	2018-05-17 12:44:00 +0200	[diff] [blame]	143	_(u8x16, u32x8, epu8_epi32)
				144	_(u8x16, u64x4, epu8_epi64)
				145	_(i16x8, i32x8, epi16_epi32)
				146	_(i16x8, i64x4, epi16_epi64)
				147	_(i32x4, i64x4, epi32_epi64)
Lijian.Zhang	7e9d5ff	2021-04-14 16:12:28 +0800	[diff] [blame]	148	_ (i8x16, i16x16, epi8_epi16)
Damjan Marion	afe56de	2018-05-17 12:44:00 +0200	[diff] [blame]	149	_(i8x16, i32x8, epi8_epi32)
				150	_(i8x16, i64x4, epi8_epi64)
				151	#undef _
				152	/* INDENT-ON */
				153
Damjan Marion	dd648aa	2020-03-12 11:56:00 +0100	[diff] [blame]	154	static_always_inline u64x4
				155	u64x4_byte_swap (u64x4 v)
				156	{
				157	u8x32 swap = {
				158	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
				159	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
				160	};
				161	return (u64x4) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
				162	}
				163
Damjan Marion	c899dac	2019-04-16 18:41:01 +0200	[diff] [blame]	164	static_always_inline u32x8
				165	u32x8_byte_swap (u32x8 v)
				166	{
				167	u8x32 swap = {
				168	3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
				169	3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
				170	};
				171	return (u32x8) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
				172	}
				173
Damjan Marion	bf129f4	2018-06-27 13:03:26 +0200	[diff] [blame]	174	static_always_inline u16x16
				175	u16x16_byte_swap (u16x16 v)
				176	{
				177	u8x32 swap = {
				178	1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
				179	1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
				180	};
				181	return (u16x16) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) swap);
				182	}
				183
Damjan Marion	94dbf95	2020-07-15 20:18:39 +0200	[diff] [blame]	184	static_always_inline u8x32
				185	u8x32_shuffle (u8x32 v, u8x32 m)
				186	{
				187	return (u8x32) _mm256_shuffle_epi8 ((__m256i) v, (__m256i) m);
				188	}
				189
				190	#define u8x32_align_right(a, b, imm) \
				191	(u8x32) _mm256_alignr_epi8 ((__m256i) a, (__m256i) b, imm)
				192
				193	static_always_inline u32
				194	u32x8_sum_elts (u32x8 sum8)
				195	{
				196	sum8 += (u32x8) u8x32_align_right (sum8, sum8, 8);
				197	sum8 += (u32x8) u8x32_align_right (sum8, sum8, 4);
				198	return sum8[0] + sum8[4];
				199	}
				200
Damjan Marion	bf129f4	2018-06-27 13:03:26 +0200	[diff] [blame]	201	static_always_inline u32x8
				202	u32x8_hadd (u32x8 v1, u32x8 v2)
				203	{
				204	return (u32x8) _mm256_hadd_epi32 ((__m256i) v1, (__m256i) v2);
				205	}
				206
Damjan Marion	08bca80	2018-06-18 22:21:40 +0200	[diff] [blame]	207	static_always_inline u16x16
				208	u16x16_mask_last (u16x16 v, u8 n_last)
				209	{
				210	const u16x16 masks[17] = {
				211	{0},
				212	{-1},
				213	{-1, -1},
				214	{-1, -1, -1},
				215	{-1, -1, -1, -1},
				216	{-1, -1, -1, -1, -1},
				217	{-1, -1, -1, -1, -1, -1},
				218	{-1, -1, -1, -1, -1, -1, -1},
				219	{-1, -1, -1, -1, -1, -1, -1, -1},
				220	{-1, -1, -1, -1, -1, -1, -1, -1, -1},
				221	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				222	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				223	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				224	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				225	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				226	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				227	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
				228	};
				229
				230	ASSERT (n_last < 17);
				231
				232	return v & masks[16 - n_last];
				233	}
				234
Damjan Marion	69fdfee	2018-10-06 14:33:18 +0200	[diff] [blame]	235	static_always_inline f32x8
				236	f32x8_from_u32x8 (u32x8 v)
				237	{
				238	return (f32x8) _mm256_cvtepi32_ps ((__m256i) v);
				239	}
				240
				241	static_always_inline u32x8
				242	u32x8_from_f32x8 (f32x8 v)
				243	{
				244	return (u32x8) _mm256_cvttps_epi32 ((__m256) v);
				245	}
				246
/* Select 32-bit elements from a or b under control of the constant mask
   m, a wrapper around _mm256_blend_epi32.  m must be a compile-time
   constant.  Arguments and the whole expansion are parenthesized,
   matching u16x16_blend, so expression arguments are cast as a whole.  */
#define u32x8_blend(a, b, m) \
  ((u32x8) _mm256_blend_epi32 ((__m256i) (a), (__m256i) (b), (m)))
				249
/* Select 16-bit elements from v1 or v2 under control of the constant
   mask, a wrapper around _mm256_blend_epi16.  mask must be a
   compile-time constant.  The whole expansion is parenthesized so the
   cast applies before any postfix operator written after the macro.  */
#define u16x16_blend(v1, v2, mask) \
  ((u16x16) _mm256_blend_epi16 ((__m256i) (v1), (__m256i) (v2), (mask)))
				252
				253	static_always_inline u64x4
				254	u64x4_gather (void p0, void p1, void p2, void p3)
				255	{
				256	u64x4 r = {
				257	(u64 ) p0, (u64 ) p1, (u64 ) p2, (u64 ) p3
				258	};
				259	return r;
				260	}
				261
				262	static_always_inline u32x8
				263	u32x8_gather (void p0, void p1, void p2, void p3, void p4, void p5,
				264	void p6, void p7)
				265	{
				266	u32x8 r = {
				267	(u32 ) p0, (u32 ) p1, (u32 ) p2, (u32 ) p3,
				268	(u32 ) p4, (u32 ) p5, (u32 ) p6, (u32 ) p7,
				269	};
				270	return r;
				271	}
				272
				273
				274	static_always_inline void
				275	u64x4_scatter (u64x4 r, void p0, void p1, void p2, void p3)
				276	{
				277	(u64 ) p0 = r[0];
				278	(u64 ) p1 = r[1];
				279	(u64 ) p2 = r[2];
				280	(u64 ) p3 = r[3];
				281	}
				282
				283	static_always_inline void
				284	u32x8_scatter (u32x8 r, void p0, void p1, void p2, void p3, void *p4,
				285	void p5, void p6, void *p7)
				286	{
				287	(u32 ) p0 = r[0];
				288	(u32 ) p1 = r[1];
				289	(u32 ) p2 = r[2];
				290	(u32 ) p3 = r[3];
				291	(u32 ) p4 = r[4];
				292	(u32 ) p5 = r[5];
				293	(u32 ) p6 = r[6];
				294	(u32 ) p7 = r[7];
				295	}
				296
				297	static_always_inline void
				298	u64x4_scatter_one (u64x4 r, int index, void *p)
				299	{
				300	(u64 ) p = r[index];
				301	}
				302
				303	static_always_inline void
				304	u32x8_scatter_one (u32x8 r, int index, void *p)
				305	{
				306	(u32 ) p = r[index];
				307	}
				308
Damjan Marion	c59b9a2	2019-03-19 15:38:40 +0100	[diff] [blame]	309	static_always_inline u8x32
				310	u8x32_is_greater (u8x32 v1, u8x32 v2)
				311	{
				312	return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
				313	}
				314
				315	static_always_inline u8x32
				316	u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
				317	{
				318	return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
				319	(__m256i) mask);
				320	}
				321
/* Build a 256-bit vector out of 128-bit lanes picked from a and b, a
   wrapper around _mm256_permute2x128_si256.  m is the intrinsic's
   compile-time immediate; elsewhere in this header 0x20 is used to
   combine the two low lanes (a low, b low) and 0x31 to combine the two
   high lanes (a high, b high).  Arguments and the whole expansion are
   parenthesized so the macros are safe inside larger expressions.  */
#define u32x8_permute_lanes(a, b, m) \
  ((u32x8) _mm256_permute2x128_si256 ((__m256i) (a), (__m256i) (b), (m)))
#define u64x4_permute_lanes(a, b, m) \
  ((u64x4) _mm256_permute2x128_si256 ((__m256i) (a), (__m256i) (b), (m)))
				326
Damjan Marion	c899dac	2019-04-16 18:41:01 +0200	[diff] [blame]	327	static_always_inline u32x8
				328	u32x8_min (u32x8 a, u32x8 b)
				329	{
				330	return (u32x8) _mm256_min_epu32 ((__m256i) a, (__m256i) b);
				331	}
				332
				333	static_always_inline u32
				334	u32x8_min_scalar (u32x8 v)
				335	{
				336	return u32x4_min_scalar (u32x4_min (u32x8_extract_lo (v),
				337	u32x8_extract_hi (v)));
				338	}
				339
Damjan Marion	9f7e33d	2019-04-08 10:14:51 +0200	[diff] [blame]	340	static_always_inline void
				341	u32x8_transpose (u32x8 a[8])
				342	{
				343	u64x4 r[8], x, y;
				344
				345	r[0] = (u64x4) u32x8_interleave_lo (a[0], a[1]);
				346	r[1] = (u64x4) u32x8_interleave_hi (a[0], a[1]);
				347	r[2] = (u64x4) u32x8_interleave_lo (a[2], a[3]);
				348	r[3] = (u64x4) u32x8_interleave_hi (a[2], a[3]);
				349	r[4] = (u64x4) u32x8_interleave_lo (a[4], a[5]);
				350	r[5] = (u64x4) u32x8_interleave_hi (a[4], a[5]);
				351	r[6] = (u64x4) u32x8_interleave_lo (a[6], a[7]);
				352	r[7] = (u64x4) u32x8_interleave_hi (a[6], a[7]);
				353
				354	x = u64x4_interleave_lo (r[0], r[2]);
				355	y = u64x4_interleave_lo (r[4], r[6]);
				356	a[0] = u32x8_permute_lanes (x, y, 0x20);
				357	a[4] = u32x8_permute_lanes (x, y, 0x31);
				358
				359	x = u64x4_interleave_hi (r[0], r[2]);
				360	y = u64x4_interleave_hi (r[4], r[6]);
				361	a[1] = u32x8_permute_lanes (x, y, 0x20);
				362	a[5] = u32x8_permute_lanes (x, y, 0x31);
				363
				364	x = u64x4_interleave_lo (r[1], r[3]);
				365	y = u64x4_interleave_lo (r[5], r[7]);
				366	a[2] = u32x8_permute_lanes (x, y, 0x20);
				367	a[6] = u32x8_permute_lanes (x, y, 0x31);
				368
				369	x = u64x4_interleave_hi (r[1], r[3]);
				370	y = u64x4_interleave_hi (r[5], r[7]);
				371	a[3] = u32x8_permute_lanes (x, y, 0x20);
				372	a[7] = u32x8_permute_lanes (x, y, 0x31);
				373	}
				374
				375	static_always_inline void
				376	u64x4_transpose (u64x4 a[8])
				377	{
				378	u64x4 r[4];
				379
				380	r[0] = u64x4_interleave_lo (a[0], a[1]);
				381	r[1] = u64x4_interleave_hi (a[0], a[1]);
				382	r[2] = u64x4_interleave_lo (a[2], a[3]);
				383	r[3] = u64x4_interleave_hi (a[2], a[3]);
				384
				385	a[0] = u64x4_permute_lanes (r[0], r[2], 0x20);
				386	a[1] = u64x4_permute_lanes (r[1], r[3], 0x20);
				387	a[2] = u64x4_permute_lanes (r[0], r[2], 0x31);
				388	a[3] = u64x4_permute_lanes (r[1], r[3], 0x31);
				389	}
				390
Damjan Marion	c576622	2018-04-16 00:18:34 +0200	[diff] [blame]	391	#endif /* included_vector_avx2_h */
				392
				393	/*
				394	* fd.io coding-style-patch-verification: ON
				395	*
				396	* Local Variables:
				397	* eval: (c-set-style "gnu")
				398	* End:
				399	*/