vhost: Added ARMV8 NEON version of function map_guest_mem()
(VPP-1085)

The NEON implementation searches particular address in
VHOST_MEMORY_MAX_NREGIONS regions. Searching two regions at a
time.

Change-Id: Icc3c6746bc98e3a1fa71424e51b64f62efbfdc74
Signed-off-by: Nitin Saxena <nitin.saxena@cavium.com>
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index 9c93ef4..be34054 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -256,7 +256,76 @@
       return (void *) (vui->region_mmap_addr[i] + addr -
 		       vui->regions[i].guest_phys_addr);
     }
+#elif __aarch64__ && __ARM_NEON
+  uint64x2_t al, ah, rl, rh, r;
+  uint32_t u32 = 0;
 
+  al = vdupq_n_u64 (addr + 1);
+  ah = vdupq_n_u64 (addr);
+
+  /*First Iteration */
+  rl = vld1q_u64 (&vui->region_guest_addr_lo[0]);
+  rl = vcgtq_u64 (al, rl);
+  rh = vld1q_u64 (&vui->region_guest_addr_hi[0]);
+  rh = vcgtq_u64 (rh, ah);
+  r = vandq_u64 (rl, rh);
+  u32 |= (vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 1);
+
+  if (u32)
+    {
+      i = __builtin_ctzll (u32);
+      goto vhost_map_guest_mem_done;
+    }
+
+  /*Second Iteration */
+  rl = vld1q_u64 (&vui->region_guest_addr_lo[2]);
+  rl = vcgtq_u64 (al, rl);
+  rh = vld1q_u64 (&vui->region_guest_addr_hi[2]);
+  rh = vcgtq_u64 (rh, ah);
+  r = vandq_u64 (rl, rh);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 2);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 3);
+
+  if (u32)
+    {
+      i = __builtin_ctzll (u32);
+      goto vhost_map_guest_mem_done;
+    }
+
+  /*Third Iteration */
+  rl = vld1q_u64 (&vui->region_guest_addr_lo[4]);
+  rl = vcgtq_u64 (al, rl);
+  rh = vld1q_u64 (&vui->region_guest_addr_hi[4]);
+  rh = vcgtq_u64 (rh, ah);
+  r = vandq_u64 (rl, rh);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 4);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 5);
+
+  if (u32)
+    {
+      i = __builtin_ctzll (u32);
+      goto vhost_map_guest_mem_done;
+    }
+
+  /*Fourth Iteration */
+  rl = vld1q_u64 (&vui->region_guest_addr_lo[6]);
+  rl = vcgtq_u64 (al, rl);
+  rh = vld1q_u64 (&vui->region_guest_addr_hi[6]);
+  rh = vcgtq_u64 (rh, ah);
+  r = vandq_u64 (rl, rh);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 6);
+  u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 7);
+
+  i = __builtin_ctzll (u32 | (1 << VHOST_MEMORY_MAX_NREGIONS));
+
+vhost_map_guest_mem_done:
+  if (i < vui->nregions)
+    {
+      *hint = i;
+      return (void *) (vui->region_mmap_addr[i] + addr -
+		       vui->regions[i].guest_phys_addr);
+    }
 #else
   for (i = 0; i < vui->nregions; i++)
     {