/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_neon_h
#define included_vector_neon_h
#include <arm_neon.h>

/* Arithmetic */
/* vqsubq_* are the saturating NEON subtracts; plain vsubq_* would wrap on
   underflow, which is not what "sub_saturate" promises. */
#define u16x8_sub_saturate(a,b) vqsubq_u16(a,b)
#define i16x8_sub_saturate(a,b) vqsubq_s16(a,b)
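
/*
 * Usage sketch (illustrative, not part of this header): saturation clamps
 * at the type's limits instead of wrapping, assuming vppinfra's u16x8
 * vector typedef.
 *
 *   u16x8 a = vdupq_n_u16 (1);
 *   u16x8 b = vdupq_n_u16 (2);
 *   u16x8 r = u16x8_sub_saturate (a, b);  // every lane is 0, not 0xffff
 */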

always_inline int
u8x16_is_all_zero (u8x16 x)
{
  /* Use a horizontal max rather than vaddvq_u8: the add reduces into a
     uint8_t and can wrap to zero for nonzero input (e.g. two 0x80 bytes),
     while the max is zero iff every lane is zero. */
  return !(vmaxvq_u8 (x));
}

always_inline int
u16x8_is_all_zero (u16x8 x)
{
  return !(vmaxvq_u16 (x));
}

always_inline int
u32x4_is_all_zero (u32x4 x)
{
  return !(vmaxvq_u32 (x));
}

always_inline int
u64x2_is_all_zero (u64x2 x)
{
  /* There is no vmaxvq_u64; a u64x2 is all zero iff it is all zero
     when viewed as u32x4. */
  return !(vmaxvq_u32 (vreinterpretq_u32_u64 (x)));
}
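
/*
 * Usage sketch (illustrative): the is_all_zero helpers pair naturally
 * with an XOR when testing two vectors for equality.
 *
 *   u8x16 a = ..., b = ...;
 *   if (u8x16_is_all_zero (veorq_u8 (a, b)))
 *     ;  // a and b are byte-for-byte identical
 */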

/* Converts a compare mask (all-ones/all-zeros bytes) to a 16-bit bitmap. */
always_inline u32
u8x16_compare_byte_mask (u8x16 x)
{
  /* Negative shift counts make vshl_u8 shift right, so byte i is
     shifted right by (7 - i), moving its MSB to bit position i. */
  static int8_t const __attribute__ ((aligned (16))) xr[8] =
    { -7, -6, -5, -4, -3, -2, -1, 0 };
  uint8x8_t mask_and = vdup_n_u8 (0x80);
  int8x8_t mask_shift = vld1_s8 (xr);

  uint8x8_t lo = vget_low_u8 (x);
  uint8x8_t hi = vget_high_u8 (x);

  /* Keep only each byte's MSB, then move it into that byte's bit slot. */
  lo = vand_u8 (lo, mask_and);
  lo = vshl_u8 (lo, mask_shift);

  hi = vand_u8 (hi, mask_and);
  hi = vshl_u8 (hi, mask_shift);

  /* Each byte now carries a distinct bit, so three rounds of pairwise
     adds fold all eight bits into lane 0 without carries. */
  lo = vpadd_u8 (lo, lo);
  lo = vpadd_u8 (lo, lo);
  lo = vpadd_u8 (lo, lo);

  hi = vpadd_u8 (hi, hi);
  hi = vpadd_u8 (hi, hi);
  hi = vpadd_u8 (hi, hi);

  return ((hi[0] << 8) | (lo[0] & 0xff));
}
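
/*
 * Usage sketch (illustrative): pairing vceqq_u8 with the byte-mask
 * helper to locate occurrences of a byte value.
 *
 *   u8x16 v = ...;
 *   u8x16 eq = vceqq_u8 (v, vdupq_n_u8 (0x2a));
 *   u32 bm = u8x16_compare_byte_mask (eq);  // bit i set iff v[i] == 0x2a
 */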

always_inline u32
u16x8_zero_byte_mask (u16x8 input)
{
  u8x16 vall_zero = vdupq_n_u8 (0x0);
  u8x16 res_values = { 0x01, 0x02, 0x04, 0x08,
                       0x10, 0x20, 0x40, 0x80,
                       0x01, 0x02, 0x04, 0x08,
                       0x10, 0x20, 0x40, 0x80
  };

  /* input --> [0x80, 0x40, 0x01, 0xf0, ... ] */
  u8x16 test_result =
    vreinterpretq_u8_u16 (vceqq_u16 (input, vreinterpretq_u16_u8 (vall_zero)));
  /* Zero u16 lanes become pairs of 0xff bytes; vminq_u8 keeps the
     per-byte weight from res_values for exactly those bytes. */
  u8x16 before_merge = vminq_u8 (test_result, res_values);
  /* before_merge --> [0x80, 0x00, 0x00, 0x10, ... ] */
  /* u8x16 --> [a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p] */
  /* pair-add until we have 2 uint64_t */
  u16x8 merge1 = vpaddlq_u8 (before_merge);
  /* u16x8 --> [a+b,c+d, e+f,g+h, i+j,k+l, m+n,o+p] */
  u32x4 merge2 = vpaddlq_u16 (merge1);
  /* u32x4 --> [a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p] */
  u64x2 merge3 = vpaddlq_u32 (merge2);
  /* u64x2 --> [a+b+c+d+e+f+g+h, i+j+k+l+m+n+o+p] */
  return (u32) (vgetq_lane_u64 (merge3, 1) << 8) + vgetq_lane_u64 (merge3, 0);
}
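
/*
 * Usage sketch (illustrative): each zero u16 lane sets two adjacent bits,
 * so input lane i maps to bits (2i) and (2i + 1) of the result.
 *
 *   u16x8 v = { 0, 7, 0, 1, 1, 1, 1, 1 };
 *   u32 m = u16x8_zero_byte_mask (v);  // m == 0x33: lanes 0 and 2 are zero
 */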

#endif /* included_vector_neon_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */