ip: vectorized ip checksum

Change-Id: Ida678e6f31daa8decb18189da712a350336326e2
Signed-off-by: Damjan Marion <damarion@cisco.com>
diff --git a/src/vnet/ip/ip_packet.h b/src/vnet/ip/ip_packet.h
index 6c86e3e..c499097 100644
--- a/src/vnet/ip/ip_packet.h
+++ b/src/vnet/ip/ip_packet.h
@@ -86,6 +86,97 @@
 
 /* IP checksum support. */
 
+static_always_inline u16
+ip_csum (void *data, u16 n_left)
+{
+  u32 sum;
+#ifdef CLIB_HAVE_VEC256
+  u16x16 v1, v2;
+  u32x8 zero = { 0 };
+  u32x8 sum8 = { 0 };
+  u32x4 sum4;
+#endif
+
+  /* if there is odd number of bytes, pad by zero and store in sum */
+  sum = (n_left & 1) ? ((u8 *) data)[n_left - 1] << 8 : 0;
+
+  /* we deal with words */
+  n_left >>= 1;
+
+#ifdef CLIB_HAVE_VEC256
+  while (n_left >= 32)
+    {
+      v1 = u16x16_load_unaligned (data);
+      v2 = u16x16_load_unaligned (data + 32);
+
+#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
+      v1 = u16x16_byte_swap (v1);
+      v2 = u16x16_byte_swap (v2);
+#endif
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_lo (v1));
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_hi (v1));
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_lo (v2));
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_hi (v2));
+      n_left -= 32;
+      data += 64;
+    }
+
+  if (n_left >= 16)
+    {
+      v1 = u16x16_load_unaligned (data);
+#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
+      v1 = u16x16_byte_swap (v1);
+#endif
+      v1 = u16x16_byte_swap (u16x16_load_unaligned (data));
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_lo (v1));
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_hi (v1));
+      n_left -= 16;
+      data += 32;
+    }
+
+  if (n_left)
+    {
+      v1 = u16x16_load_unaligned (data);
+#ifdef CLIB_ARCH_IS_LITTLE_ENDIAN
+      v1 = u16x16_byte_swap (v1);
+#endif
+      v1 = u16x16_mask_last (v1, 16 - n_left);
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_lo (v1));
+      sum8 += u16x8_extend_to_u32x8 (u16x16_extract_hi (v1));
+    }
+
+  sum8 = u32x8_hadd (sum8, zero);
+  sum4 = u32x8_extract_lo (sum8) + u32x8_extract_hi (sum8);
+  sum = sum4[0] + sum4[1];
+
+#else
+  /* scalar version */
+  while (n_left >= 8)
+    {
+      sum += clib_net_to_host_u16 (*((u16 *) data + 0));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 1));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 2));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 3));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 4));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 5));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 6));
+      sum += clib_net_to_host_u16 (*((u16 *) data + 7));
+      n_left -= 8;
+      data += 16;
+    }
+  while (n_left)
+    {
+      sum += clib_net_to_host_u16 (*(u16 *) data);
+      n_left -= 1;
+      data += 2;
+    }
+#endif
+
+  sum = (sum & 0xffff) + (sum >> 16);
+  sum = (sum & 0xffff) + (sum >> 16);
+  return ~((u16) sum);
+}
+
 /* Incremental checksum update. */
 typedef uword ip_csum_t;