Blame - src/vppinfra/vector_neon.h - fdio/vpp

blob: 3ed783602969bc410cc50ab0da208377cdd220ed [file] [log] [blame]

Christophe Fontaine	33e8195	2016-12-19 14:41:52 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2015 Cisco and/or its affiliates.
				3	* Licensed under the Apache License, Version 2.0 (the "License");
				4	* you may not use this file except in compliance with the License.
				5	* You may obtain a copy of the License at:
				6	*
				7	* http://www.apache.org/licenses/LICENSE-2.0
				8	*
				9	* Unless required by applicable law or agreed to in writing, software
				10	* distributed under the License is distributed on an "AS IS" BASIS,
				11	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	* See the License for the specific language governing permissions and
				13	* limitations under the License.
				14	*/
				15
				16	#ifndef included_vector_neon_h
				17	#define included_vector_neon_h
				18	#include <arm_neon.h>
				19
Christophe Fontaine	33e8195	2016-12-19 14:41:52 +0100	[diff] [blame]	20	/* Arithmetic */
/* Saturating subtraction: results clamp at the type's limits instead of
   wrapping (vqsubq_*), as the "_saturate" suffix promises.  The plain
   vsubq_* intrinsics wrap modulo 2^16. */
#define u16x8_sub_saturate(a,b) vqsubq_u16(a,b)
#define i16x8_sub_saturate(a,b) vqsubq_s16(a,b)
				23
Adrian Oanca	2b53e4e	2018-02-20 17:14:58 +0100	[diff] [blame]	24	always_inline int
				25	u8x16_is_all_zero (u8x16 x)
				26	{
				27	return !(vaddvq_u8 (x));
				28	}
				29
				30	always_inline int
				31	u16x8_is_all_zero (u16x8 x)
				32	{
				33	return !(vaddvq_u16 (x));
				34	}
				35
				36	always_inline int
				37	u32x4_is_all_zero (u32x4 x)
				38	{
				39	return !(vaddvq_u32 (x));
				40	}
				41
				42	always_inline int
				43	u64x2_is_all_zero (u64x2 x)
				44	{
				45	return !(vaddvq_u64 (x));
				46	}
				47
Gabriel Ganne	b81831d	2017-12-05 17:33:37 +0100	[diff] [blame]	48	/* Converts all ones/zeros compare mask to bitmap. */
				49	always_inline u32
				50	u8x16_compare_byte_mask (u8x16 x)
				51	{
Adrian Oanca	0b8792f	2018-02-15 15:44:51 +0100	[diff] [blame]	52	uint8x16_t mask_shift =
				53	{ -7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0 };
				54	uint8x16_t mask_and = vdupq_n_u8 (0x80);
				55	x = vandq_u8 (x, mask_and);
				56	x = vshlq_u8 (x, vreinterpretq_s8_u8 (mask_shift));
				57	x = vpaddq_u8 (x, x);
				58	x = vpaddq_u8 (x, x);
				59	x = vpaddq_u8 (x, x);
				60	return vgetq_lane_u8 (x, 0) \| (vgetq_lane_u8 (x, 1) << 8);
Gabriel Ganne	b81831d	2017-12-05 17:33:37 +0100	[diff] [blame]	61	}
Christophe Fontaine	33e8195	2016-12-19 14:41:52 +0100	[diff] [blame]	62
				63	always_inline u32
				64	u16x8_zero_byte_mask (u16x8 input)
				65	{
				66	u8x16 vall_one = vdupq_n_u8 (0x0);
				67	u8x16 res_values = { 0x01, 0x02, 0x04, 0x08,
				68	0x10, 0x20, 0x40, 0x80,
				69	0x01, 0x02, 0x04, 0x08,
				70	0x10, 0x20, 0x40, 0x80
				71	};
				72
				73	/* input --> [0x80, 0x40, 0x01, 0xf0, ... ] */
				74	u8x16 test_result =
				75	vreinterpretq_u8_u16 (vceqq_u16 (input, vreinterpretq_u16_u8 (vall_one)));
				76	u8x16 before_merge = vminq_u8 (test_result, res_values);
				77	/before_merge--> [0x80, 0x00, 0x00, 0x10, ... ] /
				78	/* u8x16 --> [a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p] */
				79	/* pair add until we have 2 uint64_t */
				80	u16x8 merge1 = vpaddlq_u8 (before_merge);
				81	/* u16x8--> [a+b,c+d, e+f,g+h, i+j,k+l, m+n,o+p] */
				82	u32x4 merge2 = vpaddlq_u16 (merge1);
				83	/* u32x4--> [a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p] */
				84	u64x2 merge3 = vpaddlq_u32 (merge2);
				85	/* u64x2--> [a+b+c+d+e+f+g+h, i+j+k+l+m+n+o+p] */
				86	return (u32) (vgetq_lane_u64 (merge3, 1) << 8) + vgetq_lane_u64 (merge3, 0);
				87	}
				88
Adrian Oanca	22ac59b	2018-02-23 16:27:41 +0100	[diff] [blame]	89	always_inline u32
				90	u8x16_zero_byte_mask (u8x16 input)
				91	{
				92	return u16x8_zero_byte_mask ((u16x8) input);
				93	}
				94
				95	always_inline u32
				96	u32x4_zero_byte_mask (u32x4 input)
				97	{
				98	return u16x8_zero_byte_mask ((u16x8) input);
				99	}
				100
				101	always_inline u32
				102	u64x2_zero_byte_mask (u64x2 input)
				103	{
				104	return u16x8_zero_byte_mask ((u16x8) input);
				105	}
				106
				107
				108
Christophe Fontaine	33e8195	2016-12-19 14:41:52 +0100	[diff] [blame]	109	#endif /* included_vector_neon_h */
				110
				111	/*
				112	* fd.io coding-style-patch-verification: ON
				113	*
				114	* Local Variables:
				115	* eval: (c-set-style "gnu")
				116	* End:
				117	*/