/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2023 Cisco Systems, Inc.
 */
4
5#ifndef __clib_poly1305_h__
6#define __clib_poly1305_h__
7
8#include <vppinfra/clib.h>
9#include <vppinfra/vector.h>
10#include <vppinfra/cache.h>
11#include <vppinfra/string.h>
12
/* implementation of DJB's poly1305 using 64-bit arithmetic */
14
typedef struct
{
  /* Key-derived constants, written once by clib_poly1305_init (which casts
   * away the const; NOTE(review): writing const-qualified members through a
   * cast is formally UB per C11 6.7.3 — confirm this is an accepted project
   * idiom):
   *   r[0..1] — clamped low 128 bits of the key (multiplier)
   *   r[2]    — precomputed (r[1] >> 2) * 5, used during modular reduction
   *   s[0..1] — high 128 bits of the key, added to the tag in final */
  const u64 r[3], s[2];

  /* 130-bit accumulator held in three 64-bit limbs, little-endian */
  u64 h[3];

  /* partial data — buffer for an incomplete 16-byte block between
   * clib_poly1305_update calls */
  union
  {
    u8 as_u8[16];
    u64 as_u64[2];
  } partial;

  /* number of valid bytes currently buffered in partial (0..15) */
  size_t n_partial_bytes;
} clib_poly1305_ctx;
29
30static_always_inline void
31clib_poly1305_init (clib_poly1305_ctx *ctx, const u8 key[32])
32{
33 u64u *k = (u64u *) key;
34 u64 *h = (u64 *) ctx->h;
35 u64 *r = (u64 *) ctx->r;
36 u64 *s = (u64 *) ctx->s;
37
38 /* initialize accumulator */
39 h[0] = h[1] = h[2] = 0;
40
41 /* clamp 1st half of the key and store it into r[] */
42 r[0] = k[0] & 0x0ffffffc0fffffff;
43 r[1] = k[1] & 0x0ffffffc0ffffffc;
44 s[0] = k[2];
45 s[1] = k[3];
46
47 /* precompute (r[1] >> 2) * 5 */
48 r[2] = r[1] + (r[1] >> 2);
49
50 ctx->n_partial_bytes = 0;
51}
52
/* Compute h = (h * r) mod (2^130 - 5) on 130-bit values held as three
 * 64-bit little-endian limbs.  r[2] must hold the precomputed
 * (r[1] >> 2) * 5 set up by clib_poly1305_init.  The result is only
 * partially reduced (fits in 3 limbs); full reduction happens in
 * clib_poly1305_final. */
static_always_inline void
_clib_poly1305_multiply_and_reduce (u64 h[3], const u64 r[3])
{
  /* 128-bit accumulators with direct access to their low/high halves */
  union
  {
    struct
    {
      u64 lo, hi;
    };
    u128 n;
  } l0, l1, l2;
  u64 c;

  /*
	     h2      h1      h0
     x		     r1      r0
     ---------------------------------------
	  r0 x h2  r0 x h1  r0 × h0
     + r1 x h2  r1 x h1  r1 x h0
     ---------------------------------------

     for p = 2^130-5, following applies:
     (r * 2^130) mod p == (r * 5) mod p

     bits above 130 can be shifted right (divided by 2^130)
     and multiplied by 5 per equation above

		     h2	     h1      h0
     x			     r1      r0
     ----------------------------------------------
		  r0 x h2  r0 x h1  r0 × h0
       +		   r1 x h0
       +  5x (r1 >>2) x h2  5x (r1 >>2) x h1
     ----------------------------------------------
	  [0:l2.lo] [l1.hi:l1.lo] [l0.hi:l0.lo]
   */

  l0.n = l1.n = l2.n = 0;
  /* u64 x u64 = u128 multiplications */
  l0.n += (u128) h[0] * r[0];
  l0.n += (u128) h[1] * r[2]; /* r[2] holds precomputed (r[1] >> 2) * 5 */
  l1.n += (u128) h[0] * r[1];
  l1.n += (u128) h[1] * r[0];

  /* u64 x u64 = u64 multiplications, as h[2] may have only lower 2 bits set
   * and r[1] have clamped bits 60-63 */
  l1.n += (u128) (h[2] * r[2]);
  l2.n += (u128) (h[2] * r[0]);

  /* propagate upper 64 bits to higher limb */
  c = 0;
  l1.lo = u64_add_with_carry (&c, l1.lo, l0.hi);
  l2.lo = u64_add_with_carry (&c, l2.lo, l1.hi);

  l2.hi = l2.lo;
  /* keep bits [128:129] */
  l2.lo &= 3;

  /* bits 130 and above multiply with 5 and store to l2.hi */
  l2.hi -= l2.lo;
  l2.hi += l2.hi >> 2;

  /* add l2.hi to l0.lo with carry propagation and store result to h2:h1:h0 */
  c = 0;
  h[0] = u64_add_with_carry (&c, l0.lo, l2.hi);
  h[1] = u64_add_with_carry (&c, l1.lo, 0);
  h[2] = u64_add_with_carry (&c, l2.lo, 0);
}
121
122static_always_inline u32
123_clib_poly1305_add_blocks (clib_poly1305_ctx *ctx, const u8 *msg,
124 uword n_bytes, const u32 bit17)
125{
126 u64 r[3], h[3];
127
128 for (int i = 0; i < 3; i++)
129 {
130 h[i] = ctx->h[i];
131 r[i] = ctx->r[i];
132 }
133
134 for (const u64u *m = (u64u *) msg; n_bytes >= 16; n_bytes -= 16, m += 2)
135 {
136 u64 c = 0;
137
138 /* h += m */
139 h[0] = u64_add_with_carry (&c, h[0], m[0]);
140 h[1] = u64_add_with_carry (&c, h[1], m[1]);
141 h[2] = u64_add_with_carry (&c, h[2], bit17 ? 1 : 0);
142
143 /* h = (h * r) mod p */
144 _clib_poly1305_multiply_and_reduce (h, r);
145 }
146
147 for (int i = 0; i < 3; i++)
148 ctx->h[i] = h[i];
149
150 return n_bytes;
151}
152
153static_always_inline void
154clib_poly1305_update (clib_poly1305_ctx *ctx, const u8 *msg, uword len)
155{
156 uword n_left = len;
157
158 if (n_left == 0)
159 return;
160
161 if (ctx->n_partial_bytes)
162 {
163 u16 missing_bytes = 16 - ctx->n_partial_bytes;
164 if (PREDICT_FALSE (n_left < missing_bytes))
165 {
166 clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg,
167 n_left);
168 ctx->n_partial_bytes += n_left;
169 return;
170 }
171
172 clib_memcpy_fast (ctx->partial.as_u8 + ctx->n_partial_bytes, msg,
173 missing_bytes);
174 _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 1);
175 ctx->n_partial_bytes = 0;
176 n_left -= missing_bytes;
177 msg += missing_bytes;
178 }
179
180 n_left = _clib_poly1305_add_blocks (ctx, msg, n_left, 1);
181
182 if (n_left)
183 {
184 ctx->partial.as_u64[0] = ctx->partial.as_u64[1] = 0;
185 clib_memcpy_fast (ctx->partial.as_u8, msg + len - n_left, n_left);
186 ctx->n_partial_bytes = n_left;
187 }
188}
189
/* Finish the MAC: pad and absorb any buffered partial block, fully reduce
 * the accumulator mod 2^130-5, add the key's s half and write the 16-byte
 * tag to out (unaligned store). */
static_always_inline void
clib_poly1305_final (clib_poly1305_ctx *ctx, u8 *out)
{
  const u64 p[] = { 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3 }; /* 2^130-5 */
  const u64 *s = ctx->s;
  u64u *t = (u64u *) out;
  u64 h0, h1, t0, t1;
  u64 c;

  if (ctx->n_partial_bytes)
    {
      /* pad the last block: one 0x01 byte after the data; the remaining
       * bytes are already zero (buffer was zeroed in update) and no 2^128
       * bit is added (bit17 == 0) */
      ctx->partial.as_u8[ctx->n_partial_bytes] = 1;
      _clib_poly1305_add_blocks (ctx, ctx->partial.as_u8, 16, 0);
    }

  h0 = ctx->h[0];
  h1 = ctx->h[1];

  /* h may not be fully reduced, try to subtract 2^130-5 */
  c = 0;
  t0 = u64_sub_with_borrow (&c, h0, p[0]);
  t1 = u64_sub_with_borrow (&c, h1, p[1]);
  u64_sub_with_borrow (&c, ctx->h[2], p[2]);

  /* no borrow means h >= p, so the subtracted value is the reduced h */
  if (!c)
    {
      h0 = t0;
      h1 = t1;
    }

  /* tag = (h + s) mod 2^128 */
  c = 0;
  t[0] = u64_add_with_carry (&c, h0, s[0]);
  t[1] = u64_add_with_carry (&c, h1, s[1]);
}
224
225static_always_inline void
226clib_poly1305 (const u8 *key, const u8 *msg, uword len, u8 *out)
227{
228 clib_poly1305_ctx ctx;
229 clib_poly1305_init (&ctx, key);
230 clib_poly1305_update (&ctx, msg, len);
231 clib_poly1305_final (&ctx, out);
232}
233
234#endif /* __clib_poly1305_h__ */