vppinfra: native poly1305 implementation

Type: feature
Signed-off-by: Damjan Marion <damarion@cisco.com>
Change-Id: Ic170464d7c63f243e7e676567d41d800647ebec3
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index ad942a2..46b2788 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -133,6 +133,7 @@
   crypto/aes.h
   crypto/aes_cbc.h
   crypto/aes_gcm.h
+  crypto/poly1305.h
   dlist.h
   dlmalloc.h
   elf_clib.h
@@ -284,6 +285,7 @@
 set(test_files
   test/aes_cbc.c
   test/aes_gcm.c
+  test/poly1305.c
   test/array_mask.c
   test/compress.c
   test/count_equal.c
diff --git a/src/vppinfra/clib.h b/src/vppinfra/clib.h
index b8257bf..dab7eeb 100644
--- a/src/vppinfra/clib.h
+++ b/src/vppinfra/clib.h
@@ -339,6 +339,44 @@
   _x < 0 ? -_x : _x;				\
 })
 
+/* returns a + b + *carry; *carry (in: 0 or 1) receives the carry-out */
+static_always_inline u64
+u64_add_with_carry (u64 *carry, u64 a, u64 b)
+{
+#if defined(__x86_64__)
+  unsigned long long v;
+  *carry = _addcarry_u64 (*carry, a, b, &v);
+  return (u64) v;
+#elif defined(__clang__)
+  unsigned long long c, rv = __builtin_addcll (a, b, *carry, &c);
+  *carry = c;
+  return rv;
+#else
+  u64 t = a + b, rv = t + *carry; /* at most one of the two adds can wrap */
+  *carry = (t < a) | (rv < t);
+  return rv;
+#endif
+}
+
+/* returns x - y - *borrow; *borrow (in: 0 or 1) receives the borrow-out */
+static_always_inline u64
+u64_sub_with_borrow (u64 *borrow, u64 x, u64 y)
+{
+#if defined(__x86_64__)
+  unsigned long long v;
+  *borrow = _subborrow_u64 (*borrow, x, y, &v);
+  return (u64) v;
+#elif defined(__clang__)
+  unsigned long long b, rv = __builtin_subcll (x, y, *borrow, &b);
+  *borrow = b;
+  return rv;
+#else
+  u64 t = x - y, rv = t - *borrow; /* at most one subtraction can wrap */
+  *borrow = (x < y) | (t < *borrow);
+  return rv;
+#endif
+}
+
 /* Standard standalone-only function declarations. */
 #ifndef CLIB_UNIX
 void clib_standalone_init (void *memory, uword memory_bytes);
diff --git a/src/vppinfra/crypto/poly1305.h b/src/vppinfra/crypto/poly1305.h
new file mode 100644
index 0000000..cd6ea60
--- /dev/null
+++ b/src/vppinfra/crypto/poly1305.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#ifndef __clib_poly1305_h__
+#define __clib_poly1305_h__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/string.h>
+
+/* implementation of DJB's poly1305 using 64-bit arithmetic */
+
+typedef struct
+{
+  /* r[0..1]: clamped key half, r[2]: r[1] + (r[1] >> 2); s[]: 2nd key half */
+  u64 r[3], s[2];
+  u64 h[3]; /* accumulator, three 64-bit limbs, least significant first */
+  /* buffered tail of the message which is not yet a complete 16-byte block */
+  union
+  {
+    u8 as_u8[16];
+    u64 as_u64[2];
+  } partial;
+
+  size_t n_partial_bytes; /* number of valid bytes in partial (0..15) */
+} clib_poly1305_ctx;
+
+static_always_inline void
+clib_poly1305_init (clib_poly1305_ctx *ctx, const u8 key[32])
+{
+  /* the 32-byte key is read as four u64 words through the u64u type */
+  u64u *key_words = (u64u *) key;
+  u64 *r_limbs = (u64 *) ctx->r, *s_limbs = (u64 *) ctx->s;
+  u64 r1;
+
+  /* words 2-3 go to s[] unmodified; clib_poly1305_final adds them at the end */
+  s_limbs[0] = key_words[2];
+  s_limbs[1] = key_words[3];
+
+  /* words 0-1 are masked (clamped) before being stored in r[0] and r[1] */
+  r_limbs[0] = key_words[0] & 0x0ffffffc0fffffff;
+  r_limbs[1] = r1 = key_words[1] & 0x0ffffffc0ffffffc;
+  /* r1 has its two low bits clear, so r[2] = r1 + r1 / 4 = 5 * (r1 >> 2) */
+  r_limbs[2] = r1 + (r1 >> 2);
+
+  /* start from an empty state: zero accumulator, no buffered tail bytes */
+  ctx->h[0] = ctx->h[1] = ctx->h[2] = 0;
+  ctx->n_partial_bytes = 0;
+}
+
+static_always_inline void
+_clib_poly1305_multiply_and_reduce (u64 h[3], const u64 r[3])
+{
+  union
+  {
+    struct
+    {
+      u64 lo, hi;
+    };
+    u128 n; /* overlays lo/hi; lo is the low half on little-endian targets */
+  } l0, l1, l2;
+  u64 c;
+  /* Multiplies the accumulator h by r modulo p = 2^130-5, in place. The
+     result is only partially reduced (it may still be >= p).
+
+		       h2       h1       h0
+    x                           r1       r0
+    ---------------------------------------
+		  r0 x h2  r0 x h1  r0 x h0
+    +    r1 x h2  r1 x h1  r1 x h0
+    ---------------------------------------
+
+    for p = 2^130-5, following applies:
+    (r * 2^130) mod p == (r * 5) mod p
+
+    bits above 130 can be shifted right (divided by 2^130)
+    and multiplied by 5 per equation above
+
+	     h2               h1                h0
+    x                         r1                r0
+    ----------------------------------------------
+       r0 x h2           r0 x h1           r0 x h0
+    +                    r1 x h0
+    +           5x (r1 >>2) x h2  5x (r1 >>2) x h1
+    ----------------------------------------------
+       [0:l2.lo]   [l1.hi:l1.lo]     [l0.hi:l0.lo]   */
+
+  l0.n = l1.n = l2.n = 0;
+  /* u64 x u64 = u128 multiplications */
+  l0.n += (u128) h[0] * r[0];
+  l0.n += (u128) h[1] * r[2]; /* r[2] holds precomputed (r[1] >> 2) * 5 */
+  l1.n += (u128) h[0] * r[1];
+  l1.n += (u128) h[1] * r[0];
+
+  /* u64 x u64 = u64 multiplications: h[2] is expected to be only a few bits
+   * wide and r[0], r[1] have bits 60-63 clamped, so the products fit in u64 */
+  l1.n += (u128) (h[2] * r[2]);
+  l2.n += (u128) (h[2] * r[0]);
+
+  /* propagate upper 64 bits to higher limb */
+  c = 0;
+  l1.lo = u64_add_with_carry (&c, l1.lo, l0.hi);
+  l2.lo = u64_add_with_carry (&c, l2.lo, l1.hi);
+
+  l2.hi = l2.lo; /* l2.hi is reused as scratch for the bits to fold back */
+  /* keep bits [128:129] */
+  l2.lo &= 3;
+
+  /* bits 130 and above multiply with 5 and store to l2.hi */
+  l2.hi -= l2.lo;      /* now 4 * (bits 130 and above) */
+  l2.hi += l2.hi >> 2; /* now 5 * (bits 130 and above) */
+
+  /* add l2.hi to l0.lo with carry propagation and store result to h2:h1:h0 */
+  c = 0;
+  h[0] = u64_add_with_carry (&c, l0.lo, l2.hi);
+  h[1] = u64_add_with_carry (&c, l1.lo, 0);
+  h[2] = u64_add_with_carry (&c, l2.lo, 0);
+}
+
+/* absorb every complete 16-byte block of msg, return number of bytes left */
+static_always_inline u32
+_clib_poly1305_add_blocks (clib_poly1305_ctx *ctx, const u8 *msg,
+			   uword n_bytes, const u32 bit17)
+{
+  /* work on local copies of key and accumulator while looping over blocks */
+  u64 key[3] = { ctx->r[0], ctx->r[1], ctx->r[2] };
+  u64 acc[3] = { ctx->h[0], ctx->h[1], ctx->h[2] };
+  const u64u *block = (u64u *) msg;
+  /* added to the top limb for each block, i.e. 2^128 when bit17 is set */
+  const u64 top = bit17 != 0;
+
+  while (n_bytes >= 16)
+    {
+      u64 carry = 0;
+      /* acc += block, block taken as two u64 words plus the top limb value */
+      acc[0] = u64_add_with_carry (&carry, acc[0], block[0]);
+      acc[1] = u64_add_with_carry (&carry, acc[1], block[1]);
+      acc[2] = u64_add_with_carry (&carry, acc[2], top);
+      /* acc = (acc * key) mod p */
+      _clib_poly1305_multiply_and_reduce (acc, key);
+      block += 2;
+      n_bytes -= 16;
+    }
+
+  ctx->h[0] = acc[0];
+  ctx->h[1] = acc[1];
+  ctx->h[2] = acc[2];
+  return n_bytes;
+}
+
+/* absorb len bytes of msg; an incomplete trailing block is kept in ctx */
+static_always_inline void
+clib_poly1305_update (clib_poly1305_ctx *ctx, const u8 *msg, uword len)
+{
+  uword n_left = len, n_tail;
+
+  if (n_left == 0)
+    return;
+
+  if (ctx->n_partial_bytes)
+    {
+      u16 missing_bytes = 16 - ctx->n_partial_bytes;
+      if (PREDICT_FALSE (n_left < missing_bytes))
+	{
+	  clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg,
+			    n_left);
+	  ctx->n_partial_bytes += n_left;
+	  return;
+	}
+      clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg,
+			missing_bytes);
+      _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 1);
+      ctx->n_partial_bytes = 0;
+      n_left -= missing_bytes;
+      msg += missing_bytes;
+    }
+
+  /* msg may have moved above, so find the tail via n_left and not via len */
+  n_tail = _clib_poly1305_add_blocks (ctx, msg, n_left, 1);
+  if (n_tail)
+    {
+      ctx->partial.as_u64[0] = ctx->partial.as_u64[1] = 0;
+      clib_memcpy_fast (ctx->partial.as_u8, msg + n_left - n_tail, n_tail);
+      ctx->n_partial_bytes = n_tail;
+    }
+}
+
+/* Finish the computation: absorb the buffered tail (if any), fully reduce
+ * the accumulator and write the 16-byte tag (accumulator + s) to out. */
+static_always_inline void
+clib_poly1305_final (clib_poly1305_ctx *ctx, u8 *out)
+{
+  /* limbs of p = 2^130-5, least significant limb first */
+  const u64 prime[] = { 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3 };
+  u64u *tag = (u64u *) out;
+  u64 lo, hi, diff_lo, diff_hi, flag = 0;
+
+  if (ctx->n_partial_bytes)
+    {
+      /* terminate the tail with a 1 byte and absorb it without the top bit */
+      ctx->partial.as_u8[ctx->n_partial_bytes] = 1;
+      _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 0);
+    }
+
+  lo = ctx->h[0];
+  hi = ctx->h[1];
+
+  /* h may not be fully reduced, so compute h - p across all three limbs */
+  diff_lo = u64_sub_with_borrow (&flag, lo, prime[0]);
+  diff_hi = u64_sub_with_borrow (&flag, hi, prime[1]);
+  u64_sub_with_borrow (&flag, ctx->h[2], prime[2]);
+
+  /* a borrow means h < p, in which case h is already fully reduced */
+  lo = flag ? lo : diff_lo;
+  hi = flag ? hi : diff_hi;
+
+  /* tag = (h + s) mod 2^128, so the carry out of the upper word is dropped */
+  flag = 0;
+  tag[0] = u64_add_with_carry (&flag, lo, ctx->s[0]);
+  tag[1] = u64_add_with_carry (&flag, hi, ctx->s[1]);
+}
+
+static_always_inline void
+clib_poly1305 (const u8 *key, const u8 *msg, uword len, u8 *out)
+{
+  clib_poly1305_ctx ctx; /* one-shot helper: context lives only in this call */
+  clib_poly1305_init (&ctx, key);	 /* reads 32 bytes of key */
+  clib_poly1305_update (&ctx, msg, len); /* absorbs len bytes of msg */
+  clib_poly1305_final (&ctx, out);	 /* writes the 16-byte tag to out */
+}
+
+#endif /* __clib_poly1305_h__ */
diff --git a/src/vppinfra/test/poly1305.c b/src/vppinfra/test/poly1305.c
new file mode 100644
index 0000000..34551f8
--- /dev/null
+++ b/src/vppinfra/test/poly1305.c
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2023 Cisco Systems, Inc.
+ */
+
+#include <vppinfra/format.h>
+#include <vppinfra/test/test.h>
+#include <vppinfra/crypto/poly1305.h>
+
+static const u8 text1[375] = {
+  0x41, 0x6e, 0x79, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x73, 0x73, 0x69, 0x6f,
+  0x6e, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x49, 0x45, 0x54, 0x46,
+  0x20, 0x69, 0x6e, 0x74, 0x65, 0x6e, 0x64, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74,
+  0x6f, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x75, 0x62, 0x6c, 0x69, 0x63,
+  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x61, 0x73, 0x20, 0x61, 0x6c, 0x6c, 0x20,
+  0x6f, 0x72, 0x20, 0x70, 0x61, 0x72, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x6e,
+  0x20, 0x49, 0x45, 0x54, 0x46, 0x20, 0x49, 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x65,
+  0x74, 0x2d, 0x44, 0x72, 0x61, 0x66, 0x74, 0x20, 0x6f, 0x72, 0x20, 0x52, 0x46,
+  0x43, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x73, 0x74, 0x61,
+  0x74, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x6d, 0x61, 0x64, 0x65, 0x20, 0x77,
+  0x69, 0x74, 0x68, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x78, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x61, 0x6e, 0x20, 0x49, 0x45,
+  0x54, 0x46, 0x20, 0x61, 0x63, 0x74, 0x69, 0x76, 0x69, 0x74, 0x79, 0x20, 0x69,
+  0x73, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, 0x65, 0x64, 0x20,
+  0x61, 0x6e, 0x20, 0x22, 0x49, 0x45, 0x54, 0x46, 0x20, 0x43, 0x6f, 0x6e, 0x74,
+  0x72, 0x69, 0x62, 0x75, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x2e, 0x20, 0x53, 0x75,
+  0x63, 0x68, 0x20, 0x73, 0x74, 0x61, 0x74, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x73,
+  0x20, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x6f, 0x72, 0x61, 0x6c,
+  0x20, 0x73, 0x74, 0x61, 0x74, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x69,
+  0x6e, 0x20, 0x49, 0x45, 0x54, 0x46, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f,
+  0x6e, 0x73, 0x2c, 0x20, 0x61, 0x73, 0x20, 0x77, 0x65, 0x6c, 0x6c, 0x20, 0x61,
+  0x73, 0x20, 0x77, 0x72, 0x69, 0x74, 0x74, 0x65, 0x6e, 0x20, 0x61, 0x6e, 0x64,
+  0x20, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x72, 0x6f, 0x6e, 0x69, 0x63, 0x20, 0x63,
+  0x6f, 0x6d, 0x6d, 0x75, 0x6e, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73,
+  0x20, 0x6d, 0x61, 0x64, 0x65, 0x20, 0x61, 0x74, 0x20, 0x61, 0x6e, 0x79, 0x20,
+  0x74, 0x69, 0x6d, 0x65, 0x20, 0x6f, 0x72, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65,
+  0x2c, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x61, 0x72, 0x65, 0x20, 0x61,
+  0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64, 0x20, 0x74, 0x6f
+}; /* ASCII text "Any submission to the IETF ...", message of TV2 and TV3 */
+
+static const struct
+{
+  const char *name; /* label printed when the vector fails */
+  u32 len;	    /* message length in bytes */
+  const u8 key[32];
+  const u8 *msg;
+  const u8 out[16]; /* expected tag */
+} test_cases[] = {
+  {
+    .name = "test1",
+    .len = 34,
+    .out = { 0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6, 0xc2, 0x2b, 0x8b,
+	     0xaf, 0x0c, 0x01, 0x27, 0xa9 },
+    .key = { 0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33, 0x7f, 0x44, 0x52,
+	     0xfe, 0x42, 0xd5, 0x06, 0xa8, 0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d,
+	     0xb2, 0xfd, 0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b },
+    .msg = (u8[34]){ 0x43, 0x72, 0x79, 0x70, 0x74, 0x6f, 0x67, 0x72, 0x61,
+		     0x70, 0x68, 0x69, 0x63, 0x20, 0x46, 0x6f, 0x72, 0x75,
+		     0x6d, 0x20, 0x52, 0x65, 0x73, 0x65, 0x61, 0x72, 0x63,
+		     0x68, 0x20, 0x47, 0x72, 0x6f, 0x75, 0x70 },
+  },
+  {
+    .name = "RFC8439 A3 TV1",
+    .len = 64,
+    .out = {},
+    .key = {},
+    .msg = (u8[64]){},
+  },
+  {
+    .name = "RFC8439 A3 TV2",
+    .len = sizeof (text1),
+    .out = { 0x36, 0xe5, 0xf6, 0xb5, 0xc5, 0xe0, 0x60, 0x70, 0xf0, 0xef, 0xca,
+	     0x96, 0x22, 0x7a, 0x86, 0x3e },
+    .key = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	     0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0xe5, 0xf6, 0xb5, 0xc5, 0xe0,
+	     0x60, 0x70, 0xf0, 0xef, 0xca, 0x96, 0x22, 0x7a, 0x86, 0x3e },
+    .msg = text1,
+  },
+  {
+    .name = "RFC8439 A3 TV3",
+    .len = sizeof (text1),
+    .out = { 0xf3, 0x47, 0x7e, 0x7c, 0xd9, 0x54, 0x17, 0xaf, 0x89, 0xa6, 0xb8,
+	     0x79, 0x4c, 0x31, 0x0c, 0xf0
+
+    },
+    .key = { 0x36, 0xe5, 0xf6, 0xb5, 0xc5, 0xe0, 0x60, 0x70, 0xf0, 0xef, 0xca,
+	     0x96, 0x22, 0x7a, 0x86, 0x3e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+
+    .msg = text1,
+  },
+  {
+    .name = "RFC8439 A3 TV4",
+    .len = 127,
+    .key = { 0x1c, 0x92, 0x40, 0xa5, 0xeb, 0x55, 0xd3, 0x8a, 0xf3, 0x33, 0x88,
+	     0x86, 0x04, 0xf6, 0xb5, 0xf0, 0x47, 0x39, 0x17, 0xc1, 0x40, 0x2b,
+	     0x80, 0x09, 0x9d, 0xca, 0x5c, 0xbc, 0x20, 0x70, 0x75, 0xc0 },
+    .msg =
+      (u8[127]){
+	0x27, 0x54, 0x77, 0x61, 0x73, 0x20, 0x62, 0x72, 0x69, 0x6c, 0x6c, 0x69,
+	0x67, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73,
+	0x6c, 0x69, 0x74, 0x68, 0x79, 0x20, 0x74, 0x6f, 0x76, 0x65, 0x73, 0x0a,
+	0x44, 0x69, 0x64, 0x20, 0x67, 0x79, 0x72, 0x65, 0x20, 0x61, 0x6e, 0x64,
+	0x20, 0x67, 0x69, 0x6d, 0x62, 0x6c, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74,
+	0x68, 0x65, 0x20, 0x77, 0x61, 0x62, 0x65, 0x3a, 0x0a, 0x41, 0x6c, 0x6c,
+	0x20, 0x6d, 0x69, 0x6d, 0x73, 0x79, 0x20, 0x77, 0x65, 0x72, 0x65, 0x20,
+	0x74, 0x68, 0x65, 0x20, 0x62, 0x6f, 0x72, 0x6f, 0x67, 0x6f, 0x76, 0x65,
+	0x73, 0x2c, 0x0a, 0x41, 0x6e, 0x64, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d,
+	0x6f, 0x6d, 0x65, 0x20, 0x72, 0x61, 0x74, 0x68, 0x73, 0x20, 0x6f, 0x75,
+	0x74, 0x67, 0x72, 0x61, 0x62, 0x65, 0x2e },
+    .out = { 0x45, 0x41, 0x66, 0x9a, 0x7e, 0xaa, 0xee, 0x61, 0xe7, 0x08, 0xdc,
+	     0x7c, 0xbc, 0xc5, 0xeb, 0x62 },
+  },
+  {
+    /* Test Vector #5:
+     * If one uses 130-bit partial reduction, does the code handle the case
+     * where partially reduced final result is not fully reduced? */
+    .name = "RFC8439 A3 TV5",
+    .len = 16,
+    .key = { 2 },
+    .msg = (u8[16]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+    .out = { 3 },
+  },
+  {
+    /* Test Vector #6:
+     * What happens if addition of s overflows modulo 2^128? */
+    .name = "RFC8439 A3 TV6",
+    .len = 16,
+    .key = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+    .msg = (u8[16]){ 2 },
+    .out = { 3 },
+  },
+  {
+    /* Test Vector #7:
+     * What happens if data limb is all ones and there is carry from lower
+     * limb? */
+    .name = "RFC8439 A3 TV7",
+    .len = 48,
+    .key = { 1 },
+    .msg =
+      (u8[48]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xff, 0xff, 0xff,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    .out = { 5 },
+  },
+  {
+    /* Test Vector #8:
+     * What happens if final result from polynomial part is exactly 2^130-5? */
+    .name = "RFC8439 A3 TV8",
+    .len = 48,
+    .key = { 1 },
+    .msg =
+      (u8[48]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfb, 0xfe, 0xfe, 0xfe,
+		0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+		0xfe, 0xfe, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 },
+    .out = { 0 },
+  },
+  {
+    /* Test Vector #9:
+     * What happens if final result from polynomial part is exactly 2^130-6? */
+    .name = "RFC8439 A3 TV9",
+    .len = 16,
+    .key = { 2 },
+    .msg = (u8[16]){ 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+    .out = { 0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	     0xff, 0xff, 0xff, 0xff, 0xff },
+  },
+  {
+    /* Test Vector #10:
+     * What happens if 5*H+L-type reduction produces 131-bit intermediate
+     * result? */
+    .name = "RFC8439 A3 TV10",
+    .len = 64,
+    .key = { [0] = 1, [8] = 4 },
+    .msg =
+      (u8[64]){ 0xE3, 0x35, 0x94, 0xD7, 0x50, 0x5E, 0x43, 0xB9, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x33, 0x94, 0xD7, 0x50,
+		0x5E, 0x43, 0x79, 0xCD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00 },
+    .out = { 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00,
+	     0x00, 0x00, 0x00, 0x00, 0x00 },
+  },
+  {
+    /* Test Vector #11:
+     * What happens if 5*H+L-type reduction produces 131-bit final result? */
+    .name = "RFC8439 A3 TV11",
+    .len = 48,
+    .key = { [0] = 1, [8] = 4 },
+    .msg =
+      (u8[48]){ 0xE3, 0x35, 0x94, 0xD7, 0x50, 0x5E, 0x43, 0xB9, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x33, 0x94, 0xD7, 0x50,
+		0x5E, 0x43, 0x79, 0xCD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    .out = { 0x13 },
+  }
+};
+
+/* run every vector through clib_poly1305 and report mismatching tags */
+static clib_error_t *
+test_clib_poly1305 (clib_error_t *err)
+{
+  u8 tag[16] = {};
+  FOREACH_ARRAY_ELT (tc, test_cases)
+    {
+      clib_poly1305 (tc->key, tc->msg, tc->len, tag);
+      if (memcmp (tag, tc->out, sizeof (tag)))
+	err = clib_error_return (
+	  err,
+	  "\ntest:     %s"
+	  "\nkey:      %U"
+	  "\ndata:     %U"
+	  "\nexp out:  %U"
+	  "\ncalc out: %U\n",
+	  tc->name, format_hexdump, tc->key, 32, format_hexdump, tc->msg,
+	  tc->len, format_hexdump, tc->out, 16, format_hexdump, tag, 16);
+    }
+  return err;
+}
+
+/* perf test: n_ops independent 64-byte messages, each with its own key */
+void __test_perf_fn
+perftest_64byte (test_perf_t *tp)
+{
+  u32 n_msgs = tp->n_ops;
+  u8 *msg = test_mem_alloc_and_fill_inc_u8 (n_msgs * 64, 0, 0);
+  u8 *key = test_mem_alloc_and_fill_inc_u8 (n_msgs * 32, 0, 0);
+  u8 *tag = test_mem_alloc (n_msgs * 16);
+  test_perf_event_enable (tp);
+  for (; n_msgs; n_msgs--, tag += 16, key += 32, msg += 64)
+    clib_poly1305 (key, msg, 64, tag);
+  test_perf_event_disable (tp);
+}
+
+/* perf test: a single message of n_ops bytes, so one op equals one byte */
+void __test_perf_fn
+perftest_byte (test_perf_t *tp)
+{
+  u32 msg_len = tp->n_ops;
+  u8 *msg = test_mem_alloc_and_fill_inc_u8 (msg_len, 0, 0);
+  u8 *key = test_mem_alloc_and_fill_inc_u8 (32, 0, 0);
+  u8 *tag = test_mem_alloc (16);
+
+  test_perf_event_enable (tp);
+  clib_poly1305 (key, msg, msg_len, tag);
+  test_perf_event_disable (tp);
+}
+
+REGISTER_TEST (clib_poly1305) = { /* functional test plus two perf tests */
+  .name = "clib_poly1305",
+  .fn = test_clib_poly1305, /* checks every entry of test_cases[] */
+  .perf_tests = PERF_TESTS (
+    { .name = "fixed size (64 bytes)", .n_ops = 1024, .fn = perftest_64byte },
+    { .name = "variable size (per byte)",
+      .n_ops = 16384, /* perftest_byte uses n_ops as the message length */
+      .fn = perftest_byte }),
+};
diff --git a/src/vppinfra/types.h b/src/vppinfra/types.h
index 7841823..a3fbfcc 100644
--- a/src/vppinfra/types.h
+++ b/src/vppinfra/types.h
@@ -57,12 +57,10 @@
 typedef unsigned short u16;
 #endif /* ! CLIB_LINUX_KERNEL */
 
-#if defined (__x86_64__)
-#ifndef __COVERITY__
-typedef signed int i128 __attribute__ ((mode (TI)));
-typedef unsigned int u128 __attribute__ ((mode (TI)));
-#endif
-#endif
+#ifdef __SIZEOF_INT128__ /* only defined when the compiler has __int128 */
+typedef signed __int128 i128;
+typedef unsigned __int128 u128;
+#endif
 
 #if (defined(i386) || (defined(_mips) && __mips != 64) || defined(powerpc) || defined (__SPU__) || defined(__sparc__) || defined(__arm__) || defined (__xtensa__) || defined(__TMS320C6X__))
 typedef signed int i32;