Damjan Marion | 003330c | 2023-04-12 12:19:05 +0000 | [diff] [blame] | 1 | /* SPDX-License-Identifier: Apache-2.0 |
| 2 | * Copyright(c) 2023 Cisco Systems, Inc. |
| 3 | */ |
| 4 | |
| 5 | #ifndef __clib_poly1305_h__ |
| 6 | #define __clib_poly1305_h__ |
| 7 | |
| 8 | #include <vppinfra/clib.h> |
| 9 | #include <vppinfra/vector.h> |
| 10 | #include <vppinfra/cache.h> |
| 11 | #include <vppinfra/string.h> |
| 12 | |
/* implementation of DJB's poly1305 using 64-bit arithmetic */
| 14 | |
| 15 | typedef struct |
| 16 | { |
| 17 | const u64 r[3], s[2]; |
| 18 | u64 h[3]; |
| 19 | |
| 20 | /* partial data */ |
| 21 | union |
| 22 | { |
| 23 | u8 as_u8[16]; |
| 24 | u64 as_u64[2]; |
| 25 | } partial; |
| 26 | |
| 27 | size_t n_partial_bytes; |
| 28 | } clib_poly1305_ctx; |
| 29 | |
| 30 | static_always_inline void |
| 31 | clib_poly1305_init (clib_poly1305_ctx *ctx, const u8 key[32]) |
| 32 | { |
| 33 | u64u *k = (u64u *) key; |
| 34 | u64 *h = (u64 *) ctx->h; |
| 35 | u64 *r = (u64 *) ctx->r; |
| 36 | u64 *s = (u64 *) ctx->s; |
| 37 | |
| 38 | /* initialize accumulator */ |
| 39 | h[0] = h[1] = h[2] = 0; |
| 40 | |
| 41 | /* clamp 1st half of the key and store it into r[] */ |
| 42 | r[0] = k[0] & 0x0ffffffc0fffffff; |
| 43 | r[1] = k[1] & 0x0ffffffc0ffffffc; |
| 44 | s[0] = k[2]; |
| 45 | s[1] = k[3]; |
| 46 | |
| 47 | /* precompute (r[1] >> 2) * 5 */ |
| 48 | r[2] = r[1] + (r[1] >> 2); |
| 49 | |
| 50 | ctx->n_partial_bytes = 0; |
| 51 | } |
| 52 | |
/* h = (h * r) mod p with p = 2^130 - 5; h and r are little-endian
 * 64-bit limb vectors.  Relies on invariants established elsewhere in
 * this file: r is clamped by clib_poly1305_init () (r[1] has its low 2
 * bits and bits 60-63 zero, r[2] == (r[1] >> 2) * 5) and h[2] carries
 * only the few bits above 2^128. */
static_always_inline void
_clib_poly1305_multiply_and_reduce (u64 h[3], const u64 r[3])
{
  /* 128-bit column accumulators for the schoolbook multiply below;
   * the anonymous struct gives direct access to the two 64-bit halves */
  union
  {
    struct
    {
      u64 lo, hi;
    };
    u128 n;
  } l0, l1, l2;
  u64 c;

  /*
	      h2 h1 h0
    x		 r1 r0
    ---------------------------------------
      r0 x h2  r0 x h1	r0 x h0
    + r1 x h2  r1 x h1	r1 x h0
    ---------------------------------------

    for p = 2^130-5, following applies:
    (r * 2^130) mod p == (r * 5) mod p

    bits above 130 can be shifted right (divided by 2^130)
    and multiplied by 5 per equation above

		 h2 h1 h0
    x		    r1 r0
    ----------------------------------------------
		  r0 x h2	r0 x h1	     r0 x h0
				r1 x h0
    +		5x (r1 >>2) x h2  5x (r1 >>2) x h1
    ----------------------------------------------
     [0:l2.lo]	 [l1.hi:l1.lo]	[l0.hi:l0.lo]
  */

  l0.n = l1.n = l2.n = 0;
  /* u64 x u64 = u128 multiplications */
  l0.n += (u128) h[0] * r[0];
  l0.n += (u128) h[1] * r[2]; /* r[2] holds precomputed (r[1] >> 2) * 5 */
  l1.n += (u128) h[0] * r[1];
  l1.n += (u128) h[1] * r[0];

  /* u64 x u64 = u64 multiplications, as h[2] may have only lower 2 bits set
   * and r[1] have clamped bits 60-63, so the full products fit in 64 bits */
  l1.n += (u128) (h[2] * r[2]);
  l2.n += (u128) (h[2] * r[0]);

  /* propagate upper 64 bits to higher limb */
  c = 0;
  l1.lo = u64_add_with_carry (&c, l1.lo, l0.hi);
  l2.lo = u64_add_with_carry (&c, l2.lo, l1.hi);

  l2.hi = l2.lo;
  /* keep bits [128:129] */
  l2.lo &= 3;

  /* bits 130 and above multiply with 5 and store to l2.hi:
   * after clearing the low 2 bits, l2.hi == (h >> 130) << 2, so
   * l2.hi + (l2.hi >> 2) == 5 * (h >> 130) */
  l2.hi -= l2.lo;
  l2.hi += l2.hi >> 2;

  /* add l2.hi to l0.lo with carry propagation and store result to h2:h1:h0 */
  c = 0;
  h[0] = u64_add_with_carry (&c, l0.lo, l2.hi);
  h[1] = u64_add_with_carry (&c, l1.lo, 0);
  h[2] = u64_add_with_carry (&c, l2.lo, 0);
}
| 121 | |
| 122 | static_always_inline u32 |
| 123 | _clib_poly1305_add_blocks (clib_poly1305_ctx *ctx, const u8 *msg, |
| 124 | uword n_bytes, const u32 bit17) |
| 125 | { |
| 126 | u64 r[3], h[3]; |
| 127 | |
| 128 | for (int i = 0; i < 3; i++) |
| 129 | { |
| 130 | h[i] = ctx->h[i]; |
| 131 | r[i] = ctx->r[i]; |
| 132 | } |
| 133 | |
| 134 | for (const u64u *m = (u64u *) msg; n_bytes >= 16; n_bytes -= 16, m += 2) |
| 135 | { |
| 136 | u64 c = 0; |
| 137 | |
| 138 | /* h += m */ |
| 139 | h[0] = u64_add_with_carry (&c, h[0], m[0]); |
| 140 | h[1] = u64_add_with_carry (&c, h[1], m[1]); |
| 141 | h[2] = u64_add_with_carry (&c, h[2], bit17 ? 1 : 0); |
| 142 | |
| 143 | /* h = (h * r) mod p */ |
| 144 | _clib_poly1305_multiply_and_reduce (h, r); |
| 145 | } |
| 146 | |
| 147 | for (int i = 0; i < 3; i++) |
| 148 | ctx->h[i] = h[i]; |
| 149 | |
| 150 | return n_bytes; |
| 151 | } |
| 152 | |
| 153 | static_always_inline void |
| 154 | clib_poly1305_update (clib_poly1305_ctx *ctx, const u8 *msg, uword len) |
| 155 | { |
| 156 | uword n_left = len; |
| 157 | |
| 158 | if (n_left == 0) |
| 159 | return; |
| 160 | |
| 161 | if (ctx->n_partial_bytes) |
| 162 | { |
| 163 | u16 missing_bytes = 16 - ctx->n_partial_bytes; |
| 164 | if (PREDICT_FALSE (n_left < missing_bytes)) |
| 165 | { |
| 166 | clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg, |
| 167 | n_left); |
| 168 | ctx->n_partial_bytes += n_left; |
| 169 | return; |
| 170 | } |
| 171 | |
| 172 | clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg, |
| 173 | missing_bytes); |
| 174 | _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 1); |
| 175 | ctx->n_partial_bytes = 0; |
| 176 | n_left -= missing_bytes; |
| 177 | msg += missing_bytes; |
| 178 | } |
| 179 | |
| 180 | n_left = _clib_poly1305_add_blocks (ctx, msg, n_left, 1); |
| 181 | |
| 182 | if (n_left) |
| 183 | { |
| 184 | ctx->partial.as_u64[0] = ctx->partial.as_u64[1] = 0; |
| 185 | clib_memcpy_fast (ctx->partial.as_u8, msg + len - n_left, n_left); |
| 186 | ctx->n_partial_bytes = n_left; |
| 187 | } |
| 188 | } |
| 189 | |
/* Finish the MAC: process any buffered partial block, reduce h modulo
 * 2^130 - 5, add the pad s and store the 16-byte tag to out. */
static_always_inline void
clib_poly1305_final (clib_poly1305_ctx *ctx, u8 *out)
{
  /* p = 2^130 - 5 as three little-endian 64-bit limbs
   * (3 * 2^128 + 2^128 - 5) */
  const u64 p[] = { 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3 };
  const u64 *s = ctx->s;
  u64u *t = (u64u *) out; /* unaligned store of the tag */
  u64 h0, h1, t0, t1;
  u64 c;

  if (ctx->n_partial_bytes)
    {
      /* pad the partial block: append the 0x01 byte; the bytes above it
       * are already zero because update () zeroes the buffer before
       * copying the tail in.  bit17 = 0 since this explicit byte
       * replaces the implicit 2^128 padding bit of a full block. */
      ctx->partial.as_u8[ctx->n_partial_bytes] = 1;
      _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 0);
    }

  h0 = ctx->h[0];
  h1 = ctx->h[1];

  /* h may not be fully reduced, try to subtract 2^130-5 */
  c = 0;
  t0 = u64_sub_with_borrow (&c, h0, p[0]);
  t1 = u64_sub_with_borrow (&c, h1, p[1]);
  u64_sub_with_borrow (&c, ctx->h[2], p[2]);

  /* no borrow out of the top limb means h >= p, so keep the subtracted
   * (reduced) value */
  if (!c)
    {
      h0 = t0;
      h1 = t1;
    }

  /* tag = (h + s) mod 2^128; the carry out of bit 127 is discarded */
  c = 0;
  t[0] = u64_add_with_carry (&c, h0, s[0]);
  t[1] = u64_add_with_carry (&c, h1, s[1]);
}
| 224 | |
| 225 | static_always_inline void |
| 226 | clib_poly1305 (const u8 *key, const u8 *msg, uword len, u8 *out) |
| 227 | { |
| 228 | clib_poly1305_ctx ctx; |
| 229 | clib_poly1305_init (&ctx, key); |
| 230 | clib_poly1305_update (&ctx, msg, len); |
| 231 | clib_poly1305_final (&ctx, out); |
| 232 | } |
| 233 | |
| 234 | #endif /* __clib_poly1305_h__ */ |