blob: 9bd5c6832177ad33dda609350764606fffdba3c2 [file] [log] [blame]
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001/*
2 * Copyright (C) 2021 Denys Vlasenko
3 *
4 * Licensed under GPLv2, see file LICENSE in this source tree.
5 */
6#include "tls.h"
7
8#define SP_DEBUG 0
9#define FIXED_SECRET 0
10#define FIXED_PEER_PUBKEY 0
11
Denys Vlasenko3b411eb2021-10-05 20:00:50 +020012#define ALLOW_ASM 1
13
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020014#if SP_DEBUG
15# define dbg(...) fprintf(stderr, __VA_ARGS__)
16static void dump_hex(const char *fmt, const void *vp, int len)
17{
18 char hexbuf[32 * 1024 + 4];
19 const uint8_t *p = vp;
20
21 bin2hex(hexbuf, (void*)p, len)[0] = '\0';
22 dbg(fmt, hexbuf);
23}
24#else
25# define dbg(...) ((void)0)
26# define dump_hex(...) ((void)0)
27#endif
28
Denys Vlasenko3b411eb2021-10-05 20:00:50 +020029typedef uint32_t sp_digit;
30typedef int32_t signed_sp_digit;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020031
Denys Vlasenko4bc9da12021-11-27 11:28:11 +010032/* 64-bit optimizations:
33 * if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff,
34 * then loads and stores can be done in 64-bit chunks.
35 *
36 * A narrower case is when arch is also little-endian (such as x86_64),
37 * then "LSW first", uint32[8] and uint64[4] representations are equivalent,
38 * and arithmetic can be done in 64 bits too.
39 */
40#if defined(__GNUC__) && defined(__x86_64__)
41# define UNALIGNED_LE_64BIT 1
42#else
43# define UNALIGNED_LE_64BIT 0
44#endif
45
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020046/* The code below is taken from parts of
47 * wolfssl-3.15.3/wolfcrypt/src/sp_c32.c
48 * and heavily modified.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020049 */
50
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020051typedef struct sp_point {
Denys Vlasenko1b93c7c2021-11-28 02:56:02 +010052 sp_digit x[8]
53#if ULONG_MAX > 0xffffffff
54 /* Make sp_point[] arrays to not be 64-bit misaligned */
55 ALIGNED(8)
56#endif
57 ;
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +010058 sp_digit y[8];
59 sp_digit z[8];
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020060 int infinity;
61} sp_point;
62
63/* The modulus (prime) of the curve P256. */
Denys Vlasenko1b93c7c2021-11-28 02:56:02 +010064static const sp_digit p256_mod[8] ALIGNED(8) = {
Denys Vlasenko3b411eb2021-10-05 20:00:50 +020065 0xffffffff,0xffffffff,0xffffffff,0x00000000,
66 0x00000000,0x00000000,0x00000001,0xffffffff,
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020067};
68
69#define p256_mp_mod ((sp_digit)0x000001)
70
Denys Vlasenko3b411eb2021-10-05 20:00:50 +020071/* Normalize the values in each word to 32 bits - NOP */
72#define sp_256_norm_8(a) ((void)0)
Denys Vlasenko77145182021-10-01 13:51:39 +020073
Denys Vlasenkoe7305052021-10-05 13:30:48 +020074/* Write r as big endian to byte array.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020075 * Fixed length number of bytes written: 32
76 *
77 * r A single precision integer.
78 * a Byte array.
79 */
Denys Vlasenko4bc9da12021-11-27 11:28:11 +010080#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
81static void sp_256_to_bin_8(const sp_digit* rr, uint8_t* a)
82{
83 int i;
84 const uint64_t* r = (void*)rr;
85
86 sp_256_norm_8(rr);
87
88 r += 4;
89 for (i = 0; i < 4; i++) {
90 r--;
91 move_to_unaligned64(a, SWAP_BE64(*r));
92 a += 8;
93 }
94}
95#else
Denys Vlasenko3b411eb2021-10-05 20:00:50 +020096static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020097{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +020098 int i;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +020099
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200100 sp_256_norm_8(r);
Denys Vlasenko77145182021-10-01 13:51:39 +0200101
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200102 r += 8;
103 for (i = 0; i < 8; i++) {
104 r--;
105 move_to_unaligned32(a, SWAP_BE32(*r));
106 a += 4;
Denys Vlasenko12040122021-04-26 20:24:34 +0200107 }
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200108}
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100109#endif
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200110
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200111/* Read big endian unsigned byte array into r.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200112 *
113 * r A single precision integer.
114 * a Byte array.
115 * n Number of bytes in array to read.
116 */
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100117#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
118static void sp_256_from_bin_8(sp_digit* rr, const uint8_t* a)
119{
120 int i;
121 uint64_t* r = (void*)rr;
122
123 r += 4;
124 for (i = 0; i < 4; i++) {
125 uint64_t v;
126 move_from_unaligned64(v, a);
127 *--r = SWAP_BE64(v);
128 a += 8;
129 }
130}
131#else
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200132static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200133{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200134 int i;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200135
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200136 r += 8;
137 for (i = 0; i < 8; i++) {
138 sp_digit v;
139 move_from_unaligned32(v, a);
140 *--r = SWAP_BE32(v);
141 a += 4;
Denys Vlasenko12040122021-04-26 20:24:34 +0200142 }
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200143}
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100144#endif
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200145
Denys Vlasenko137864f2021-10-05 13:47:42 +0200146#if SP_DEBUG
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200147static void dump_256(const char *fmt, const sp_digit* r)
Denys Vlasenko137864f2021-10-05 13:47:42 +0200148{
Denys Vlasenko137864f2021-10-05 13:47:42 +0200149 uint8_t b32[32];
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200150 sp_256_to_bin_8(r, b32);
Denys Vlasenko137864f2021-10-05 13:47:42 +0200151 dump_hex(fmt, b32, 32);
152}
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200153static void dump_512(const char *fmt, const sp_digit* r)
Denys Vlasenko137864f2021-10-05 13:47:42 +0200154{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200155 uint8_t b64[64];
156 sp_256_to_bin_8(r, b64 + 32);
157 sp_256_to_bin_8(r+8, b64);
158 dump_hex(fmt, b64, 64);
Denys Vlasenko137864f2021-10-05 13:47:42 +0200159}
160#else
161# define dump_256(...) ((void)0)
162# define dump_512(...) ((void)0)
163#endif
164
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200165/* Convert a point of big-endian 32-byte x,y pair to type sp_point. */
166static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32)
167{
Denys Vlasenko12040122021-04-26 20:24:34 +0200168 memset(p, 0, sizeof(*p));
169 /*p->infinity = 0;*/
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200170 sp_256_from_bin_8(p->x, bin2x32);
171 sp_256_from_bin_8(p->y, bin2x32 + 32);
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200172 p->z[0] = 1; /* p->z = 1 */
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200173}
174
Denys Vlasenkob3b17132021-04-26 16:53:53 +0200175/* Compare a with b.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200176 *
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200177 * return -ve, 0 or +ve if a is less than, equal to or greater than b
178 * respectively.
179 */
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100180#if UNALIGNED_LE_64BIT
181static signed_sp_digit sp_256_cmp_8(const sp_digit* aa, const sp_digit* bb)
182{
183 const uint64_t* a = (void*)aa;
184 const uint64_t* b = (void*)bb;
185 int i;
186 for (i = 3; i >= 0; i--) {
187 if (a[i] == b[i])
188 continue;
189 return (a[i] > b[i]) * 2 - 1;
190 }
191 return 0;
192}
193#else
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200194static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200195{
Denys Vlasenko12040122021-04-26 20:24:34 +0200196 int i;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200197 for (i = 7; i >= 0; i--) {
198/* signed_sp_digit r = a[i] - b[i];
199 * if (r != 0)
200 * return r;
201 * does not work: think about a[i]=0, b[i]=0xffffffff
202 */
203 if (a[i] == b[i])
204 continue;
205 return (a[i] > b[i]) * 2 - 1;
Denys Vlasenko12040122021-04-26 20:24:34 +0200206 }
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200207 return 0;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200208}
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100209#endif
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200210
211/* Compare two numbers to determine if they are equal.
212 *
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200213 * return 1 when equal and 0 otherwise.
214 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200215static int sp_256_cmp_equal_8(const sp_digit* a, const sp_digit* b)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200216{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200217 return sp_256_cmp_8(a, b) == 0;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200218}
219
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200220/* Add b to a into r. (r = a + b). Return !0 on overflow */
221static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200222{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200223#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
224 sp_digit reg;
225 asm volatile (
226"\n movl (%0), %3"
227"\n addl (%1), %3"
228"\n movl %3, (%2)"
229"\n"
230"\n movl 1*4(%0), %3"
231"\n adcl 1*4(%1), %3"
232"\n movl %3, 1*4(%2)"
233"\n"
234"\n movl 2*4(%0), %3"
235"\n adcl 2*4(%1), %3"
236"\n movl %3, 2*4(%2)"
237"\n"
238"\n movl 3*4(%0), %3"
239"\n adcl 3*4(%1), %3"
240"\n movl %3, 3*4(%2)"
241"\n"
242"\n movl 4*4(%0), %3"
243"\n adcl 4*4(%1), %3"
244"\n movl %3, 4*4(%2)"
245"\n"
246"\n movl 5*4(%0), %3"
247"\n adcl 5*4(%1), %3"
248"\n movl %3, 5*4(%2)"
249"\n"
250"\n movl 6*4(%0), %3"
251"\n adcl 6*4(%1), %3"
252"\n movl %3, 6*4(%2)"
253"\n"
254"\n movl 7*4(%0), %3"
255"\n adcl 7*4(%1), %3"
256"\n movl %3, 7*4(%2)"
257"\n"
258"\n sbbl %3, %3"
259"\n"
260 : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
261 : "0" (a), "1" (b), "2" (r)
262 : "memory"
263 );
264 return reg;
Denys Vlasenko911344a2021-10-06 17:17:34 +0200265#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
Denys Vlasenko911344a2021-10-06 17:17:34 +0200266 uint64_t reg;
267 asm volatile (
268"\n movq (%0), %3"
269"\n addq (%1), %3"
270"\n movq %3, (%2)"
271"\n"
272"\n movq 1*8(%0), %3"
273"\n adcq 1*8(%1), %3"
274"\n movq %3, 1*8(%2)"
275"\n"
276"\n movq 2*8(%0), %3"
277"\n adcq 2*8(%1), %3"
278"\n movq %3, 2*8(%2)"
279"\n"
280"\n movq 3*8(%0), %3"
281"\n adcq 3*8(%1), %3"
282"\n movq %3, 3*8(%2)"
283"\n"
284"\n sbbq %3, %3"
285"\n"
286 : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
287 : "0" (a), "1" (b), "2" (r)
288 : "memory"
289 );
290 return reg;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200291#else
Denys Vlasenko12040122021-04-26 20:24:34 +0200292 int i;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200293 sp_digit carry;
294
295 carry = 0;
296 for (i = 0; i < 8; i++) {
297 sp_digit w, v;
298 w = b[i] + carry;
299 v = a[i];
300 if (w != 0) {
301 v = a[i] + w;
302 carry = (v < a[i]);
303 /* hope compiler detects above as "carry flag set" */
304 }
305 /* else: b + carry == 0, two cases:
306 * b:ffffffff, carry:1
307 * b:00000000, carry:0
308 * in either case, r[i] = a[i] and carry remains unchanged
309 */
310 r[i] = v;
311 }
312 return carry;
313#endif
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200314}
315
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200316/* Sub b from a into r. (r = a - b). Return !0 on underflow */
317static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200318{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200319#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
320 sp_digit reg;
321 asm volatile (
322"\n movl (%0), %3"
323"\n subl (%1), %3"
324"\n movl %3, (%2)"
325"\n"
326"\n movl 1*4(%0), %3"
327"\n sbbl 1*4(%1), %3"
328"\n movl %3, 1*4(%2)"
329"\n"
330"\n movl 2*4(%0), %3"
331"\n sbbl 2*4(%1), %3"
332"\n movl %3, 2*4(%2)"
333"\n"
334"\n movl 3*4(%0), %3"
335"\n sbbl 3*4(%1), %3"
336"\n movl %3, 3*4(%2)"
337"\n"
338"\n movl 4*4(%0), %3"
339"\n sbbl 4*4(%1), %3"
340"\n movl %3, 4*4(%2)"
341"\n"
342"\n movl 5*4(%0), %3"
343"\n sbbl 5*4(%1), %3"
344"\n movl %3, 5*4(%2)"
345"\n"
346"\n movl 6*4(%0), %3"
347"\n sbbl 6*4(%1), %3"
348"\n movl %3, 6*4(%2)"
349"\n"
350"\n movl 7*4(%0), %3"
351"\n sbbl 7*4(%1), %3"
352"\n movl %3, 7*4(%2)"
353"\n"
354"\n sbbl %3, %3"
355"\n"
356 : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
357 : "0" (a), "1" (b), "2" (r)
358 : "memory"
359 );
360 return reg;
Denys Vlasenko911344a2021-10-06 17:17:34 +0200361#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
Denys Vlasenko911344a2021-10-06 17:17:34 +0200362 uint64_t reg;
363 asm volatile (
364"\n movq (%0), %3"
365"\n subq (%1), %3"
366"\n movq %3, (%2)"
367"\n"
368"\n movq 1*8(%0), %3"
369"\n sbbq 1*8(%1), %3"
370"\n movq %3, 1*8(%2)"
371"\n"
372"\n movq 2*8(%0), %3"
373"\n sbbq 2*8(%1), %3"
374"\n movq %3, 2*8(%2)"
375"\n"
376"\n movq 3*8(%0), %3"
377"\n sbbq 3*8(%1), %3"
378"\n movq %3, 3*8(%2)"
379"\n"
380"\n sbbq %3, %3"
381"\n"
382 : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
383 : "0" (a), "1" (b), "2" (r)
384 : "memory"
385 );
386 return reg;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200387#else
Denys Vlasenko12040122021-04-26 20:24:34 +0200388 int i;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200389 sp_digit borrow;
390
391 borrow = 0;
392 for (i = 0; i < 8; i++) {
393 sp_digit w, v;
394 w = b[i] + borrow;
395 v = a[i];
396 if (w != 0) {
397 v = a[i] - w;
398 borrow = (v > a[i]);
399 /* hope compiler detects above as "carry flag set" */
400 }
401 /* else: b + borrow == 0, two cases:
402 * b:ffffffff, borrow:1
403 * b:00000000, borrow:0
404 * in either case, r[i] = a[i] and borrow remains unchanged
405 */
406 r[i] = v;
407 }
408 return borrow;
409#endif
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200410}
411
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200412/* Sub p256_mod from r. (r = r - p256_mod). */
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200413#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200414static void sp_256_sub_8_p256_mod(sp_digit* r)
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200415{
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200416//p256_mod[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
417 asm volatile (
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200418"\n subl $0xffffffff, (%0)"
419"\n sbbl $0xffffffff, 1*4(%0)"
420"\n sbbl $0xffffffff, 2*4(%0)"
421"\n sbbl $0, 3*4(%0)"
422"\n sbbl $0, 4*4(%0)"
423"\n sbbl $0, 5*4(%0)"
424"\n sbbl $1, 6*4(%0)"
425"\n sbbl $0xffffffff, 7*4(%0)"
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200426"\n"
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200427 : "=r" (r)
428 : "0" (r)
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200429 : "memory"
430 );
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200431}
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200432#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200433static void sp_256_sub_8_p256_mod(sp_digit* r)
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200434{
435 uint64_t reg;
436 uint64_t ooff;
437//p256_mod[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
438 asm volatile (
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200439"\n addq $1, (%0)" // adding 1 is the same as subtracting ffffffffffffffff
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200440"\n cmc" // only carry bit needs inverting
Denys Vlasenko17e6fb02021-10-06 21:22:36 +0200441"\n"
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200442"\n sbbq %1, 1*8(%0)" // %1 holds 00000000ffffffff
Denys Vlasenko17e6fb02021-10-06 21:22:36 +0200443"\n"
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200444"\n sbbq $0, 2*8(%0)"
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200445"\n"
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200446"\n movq 3*8(%0), %2"
447"\n sbbq $0, %2" // adding 00000000ffffffff (in %1)
448"\n addq %1, %2" // is the same as subtracting ffffffff00000001
449"\n movq %2, 3*8(%0)"
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200450"\n"
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200451 : "=r" (r), "=r" (ooff), "=r" (reg)
452 : "0" (r), "1" (0x00000000ffffffff)
Denys Vlasenko87e3f2e2021-10-06 19:59:39 +0200453 : "memory"
454 );
455}
Denys Vlasenko567eefc2021-10-06 14:25:10 +0200456#else
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200457static void sp_256_sub_8_p256_mod(sp_digit* r)
458{
459 sp_256_sub_8(r, r, p256_mod);
460}
Denys Vlasenko567eefc2021-10-06 14:25:10 +0200461#endif
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200462
Denys Vlasenko4415f7b2021-11-27 15:47:26 +0100463/* Multiply a and b into r. (r = a * b)
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100464 * r should be [16] array (512 bits), and must not coincide with a or b.
Denys Vlasenko4415f7b2021-11-27 15:47:26 +0100465 */
466static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200467{
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200468#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200469 int k;
470 uint32_t accl;
471 uint32_t acch;
472
473 acch = accl = 0;
474 for (k = 0; k < 15; k++) {
475 int i, j;
476 uint32_t acc_hi;
477 i = k - 7;
478 if (i < 0)
479 i = 0;
480 j = k - i;
481 acc_hi = 0;
482 do {
483////////////////////////
484// uint64_t m = ((uint64_t)a[i]) * b[j];
485// acc_hi:acch:accl += m;
486 asm volatile (
487 // a[i] is already loaded in %%eax
488"\n mull %7"
489"\n addl %%eax, %0"
490"\n adcl %%edx, %1"
491"\n adcl $0, %2"
492 : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
493 : "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j])
494 : "cc", "dx"
495 );
496////////////////////////
497 j--;
498 i++;
499 } while (i != 8 && i <= k);
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100500 r[k] = accl;
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200501 accl = acch;
502 acch = acc_hi;
503 }
504 r[15] = accl;
Denys Vlasenko911344a2021-10-06 17:17:34 +0200505#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
Denys Vlasenko911344a2021-10-06 17:17:34 +0200506 const uint64_t* aa = (const void*)a;
507 const uint64_t* bb = (const void*)b;
Denys Vlasenko0b13ab62021-11-27 19:36:23 +0100508 uint64_t* rr = (void*)r;
Denys Vlasenko911344a2021-10-06 17:17:34 +0200509 int k;
510 uint64_t accl;
511 uint64_t acch;
512
513 acch = accl = 0;
514 for (k = 0; k < 7; k++) {
515 int i, j;
516 uint64_t acc_hi;
517 i = k - 3;
518 if (i < 0)
519 i = 0;
520 j = k - i;
521 acc_hi = 0;
522 do {
523////////////////////////
524// uint128_t m = ((uint128_t)a[i]) * b[j];
525// acc_hi:acch:accl += m;
526 asm volatile (
527 // aa[i] is already loaded in %%rax
528"\n mulq %7"
529"\n addq %%rax, %0"
530"\n adcq %%rdx, %1"
531"\n adcq $0, %2"
532 : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
533 : "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
534 : "cc", "dx"
535 );
536////////////////////////
Denys Vlasenko17e6fb02021-10-06 21:22:36 +0200537 j--;
Denys Vlasenko911344a2021-10-06 17:17:34 +0200538 i++;
539 } while (i != 4 && i <= k);
540 rr[k] = accl;
541 accl = acch;
542 acch = acc_hi;
543 }
544 rr[7] = accl;
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200545#elif 0
546 //TODO: arm assembly (untested)
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200547 asm volatile (
548"\n mov r5, #0"
549"\n mov r6, #0"
550"\n mov r7, #0"
551"\n mov r8, #0"
552"\n 1:"
553"\n subs r3, r5, #28"
554"\n movcc r3, #0"
555"\n sub r4, r5, r3"
Denys Vlasenko22fd8fd2021-10-06 16:10:49 +0200556"\n 2:"
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200557"\n ldr r14, [%[a], r3]"
558"\n ldr r12, [%[b], r4]"
559"\n umull r9, r10, r14, r12"
560"\n adds r6, r6, r9"
561"\n adcs r7, r7, r10"
562"\n adc r8, r8, #0"
563"\n add r3, r3, #4"
564"\n sub r4, r4, #4"
565"\n cmp r3, #32"
566"\n beq 3f"
567"\n cmp r3, r5"
568"\n ble 2b"
569"\n 3:"
570"\n str r6, [%[r], r5]"
571"\n mov r6, r7"
572"\n mov r7, r8"
573"\n mov r8, #0"
574"\n add r5, r5, #4"
575"\n cmp r5, #56"
576"\n ble 1b"
577"\n str r6, [%[r], r5]"
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100578 : [r] "r" (r), [a] "r" (a), [b] "r" (b)
Denys Vlasenko22fd8fd2021-10-06 16:10:49 +0200579 : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200580 );
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200581#else
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200582 int i, j, k;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200583 uint64_t acc;
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200584
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200585 acc = 0;
586 for (k = 0; k < 15; k++) {
587 uint32_t acc_hi;
588 i = k - 7;
589 if (i < 0)
590 i = 0;
591 j = k - i;
592 acc_hi = 0;
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200593 do {
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200594 uint64_t m = ((uint64_t)a[i]) * b[j];
595 acc += m;
596 if (acc < m)
597 acc_hi++;
598 j--;
599 i++;
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200600 } while (i != 8 && i <= k);
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100601 r[k] = acc;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200602 acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200603 }
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200604 r[15] = acc;
Denys Vlasenkobbd723e2021-10-05 23:19:18 +0200605#endif
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200606}
607
Denys Vlasenko389329e2021-10-05 13:39:33 +0200608/* Shift number right one bit. Bottom bit is lost. */
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100609#if UNALIGNED_LE_64BIT
610static void sp_256_rshift1_8(sp_digit* rr, uint64_t carry)
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200611{
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100612 uint64_t *r = (void*)rr;
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200613 int i;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200614
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100615 carry = (((uint64_t)!!carry) << 63);
616 for (i = 3; i >= 0; i--) {
617 uint64_t c = r[i] << 63;
618 r[i] = (r[i] >> 1) | carry;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200619 carry = c;
620 }
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200621}
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100622#else
623static void sp_256_rshift1_8(sp_digit* r, sp_digit carry)
624{
625 int i;
626
627 carry = (((sp_digit)!!carry) << 31);
628 for (i = 7; i >= 0; i--) {
629 sp_digit c = r[i] << 31;
630 r[i] = (r[i] >> 1) | carry;
631 carry = c;
632 }
633}
634#endif
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200635
Denys Vlasenkodcfd8d32021-11-27 16:07:42 +0100636/* Divide the number by 2 mod the modulus (prime). (r = (r / 2) % m) */
637static void sp_256_div2_8(sp_digit* r /*, const sp_digit* m*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200638{
Denys Vlasenkodcfd8d32021-11-27 16:07:42 +0100639 const sp_digit* m = p256_mod;
640
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200641 int carry = 0;
Denys Vlasenkodcfd8d32021-11-27 16:07:42 +0100642 if (r[0] & 1)
643 carry = sp_256_add_8(r, r, m);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200644 sp_256_norm_8(r);
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100645 sp_256_rshift1_8(r, carry);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200646}
647
648/* Add two Montgomery form numbers (r = a + b % m) */
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200649static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b
650 /*, const sp_digit* m*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200651{
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200652// const sp_digit* m = p256_mod;
653
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200654 int carry = sp_256_add_8(r, a, b);
655 sp_256_norm_8(r);
656 if (carry) {
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200657 sp_256_sub_8_p256_mod(r);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200658 sp_256_norm_8(r);
Denys Vlasenko55578f22021-10-05 19:45:56 +0200659 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200660}
661
662/* Subtract two Montgomery form numbers (r = a - b % m) */
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200663static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b
664 /*, const sp_digit* m*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200665{
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200666 const sp_digit* m = p256_mod;
667
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200668 int borrow;
669 borrow = sp_256_sub_8(r, a, b);
670 sp_256_norm_8(r);
671 if (borrow) {
672 sp_256_add_8(r, r, m);
673 sp_256_norm_8(r);
Denys Vlasenko55578f22021-10-05 19:45:56 +0200674 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200675}
676
677/* Double a Montgomery form number (r = a + a % m) */
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200678static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200679{
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200680// const sp_digit* m = p256_mod;
681
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200682 int carry = sp_256_add_8(r, a, a);
683 sp_256_norm_8(r);
684 if (carry)
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200685 sp_256_sub_8_p256_mod(r);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200686 sp_256_norm_8(r);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200687}
688
689/* Triple a Montgomery form number (r = a + a + a % m) */
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200690static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200691{
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200692// const sp_digit* m = p256_mod;
693
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200694 int carry = sp_256_add_8(r, a, a);
695 sp_256_norm_8(r);
696 if (carry) {
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200697 sp_256_sub_8_p256_mod(r);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200698 sp_256_norm_8(r);
Denys Vlasenko55578f22021-10-05 19:45:56 +0200699 }
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200700 carry = sp_256_add_8(r, r, a);
701 sp_256_norm_8(r);
702 if (carry) {
Denys Vlasenko5e9c6172021-10-06 20:14:49 +0200703 sp_256_sub_8_p256_mod(r);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200704 sp_256_norm_8(r);
Denys Vlasenko55578f22021-10-05 19:45:56 +0200705 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200706}
707
Denys Vlasenko4415f7b2021-11-27 15:47:26 +0100708/* Shift the result in the high 256 bits down to the bottom.
Denys Vlasenko4415f7b2021-11-27 15:47:26 +0100709 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100710static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100711{
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100712 memcpy(r, a + 8, sizeof(*r) * 8);
Denys Vlasenko4bc9da12021-11-27 11:28:11 +0100713}
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200714
Denys Vlasenko4415f7b2021-11-27 15:47:26 +0100715/* Mul a by scalar b and add into r. (r += a * b)
716 * a = p256_mod
717 * b = r[0]
718 */
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200719static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200720{
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200721// const sp_digit* a = p256_mod;
722//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
723 sp_digit b = r[0];
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200724
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200725 uint64_t t;
726
727// t = 0;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200728// for (i = 0; i < 8; i++) {
729// uint32_t t_hi;
730// uint64_t m = ((uint64_t)b * a[i]) + r[i];
731// t += m;
732// t_hi = (t < m);
733// r[i] = (sp_digit)t;
734// t = (t >> 32) | ((uint64_t)t_hi << 32);
735// }
736// r[8] += (sp_digit)t;
737
738 // Unroll, then optimize the above loop:
739 //uint32_t t_hi;
740 uint64_t m;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200741 uint32_t t32;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200742
743 //m = ((uint64_t)b * a[0]) + r[0];
744 // Since b is r[0] and a[0] is ffffffff, the above optimizes to:
745 // m = r[0] * ffffffff + r[0] = (r[0] * 100000000 - r[0]) + r[0] = r[0] << 32;
746 //t += m;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200747 // t = r[0] << 32 = b << 32;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200748 //t_hi = (t < m);
749 // t_hi = 0;
750 //r[0] = (sp_digit)t;
751 r[0] = 0;
752 //t = (t >> 32) | ((uint64_t)t_hi << 32);
753 // t = b;
754
755 //m = ((uint64_t)b * a[1]) + r[1];
756 // Since a[1] is ffffffff, the above optimizes to:
757 // m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
758 //t += m;
759 // t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
760 //t_hi = (t < m);
761 // t_hi = 0;
762 //r[1] = (sp_digit)t;
763 // r[1] = r[1];
764 //t = (t >> 32) | ((uint64_t)t_hi << 32);
765 // t = b;
766
767 //m = ((uint64_t)b * a[2]) + r[2];
768 // Since a[2] is ffffffff, the above optimizes to:
769 // m = b * ffffffff + r[2] = (b * 100000000 - b) + r[2] = (b << 32) - b + r[2];
770 //t += m;
771 // t = b + (b << 32) - b + r[2] = (b << 32) + r[2]
772 //t_hi = (t < m);
773 // t_hi = 0;
774 //r[2] = (sp_digit)t;
775 // r[2] = r[2];
776 //t = (t >> 32) | ((uint64_t)t_hi << 32);
777 // t = b;
778
779 //m = ((uint64_t)b * a[3]) + r[3];
780 // Since a[3] is 00000000, the above optimizes to:
781 // m = b * 0 + r[3] = r[3];
782 //t += m;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200783 // t = b + r[3];
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200784 //t_hi = (t < m);
785 // t_hi = 0;
786 //r[3] = (sp_digit)t;
787 r[3] = r[3] + b;
788 //t = (t >> 32) | ((uint64_t)t_hi << 32);
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200789 t32 = (r[3] < b); // 0 or 1
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200790
791 //m = ((uint64_t)b * a[4]) + r[4];
792 // Since a[4] is 00000000, the above optimizes to:
793 // m = b * 0 + r[4] = r[4];
794 //t += m;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200795 // t = t32 + r[4];
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200796 //t_hi = (t < m);
797 // t_hi = 0;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200798 //r[4] = (sp_digit)t;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200799 //t = (t >> 32) | ((uint64_t)t_hi << 32);
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200800 if (t32 != 0) {
801 r[4]++;
802 t32 = (r[4] == 0); // 0 or 1
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200803
804 //m = ((uint64_t)b * a[5]) + r[5];
805 // Since a[5] is 00000000, the above optimizes to:
806 // m = b * 0 + r[5] = r[5];
807 //t += m;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200808 // t = t32 + r[5]; (t32 is 0 or 1)
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200809 //t_hi = (t < m);
810 // t_hi = 0;
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200811 //r[5] = (sp_digit)t;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200812 //t = (t >> 32) | ((uint64_t)t_hi << 32);
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200813 if (t32 != 0) {
814 r[5]++;
815 t32 = (r[5] == 0); // 0 or 1
816 }
817 }
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200818
819 //m = ((uint64_t)b * a[6]) + r[6];
820 // Since a[6] is 00000001, the above optimizes to:
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200821 // m = (uint64_t)b + r[6]; // 33 bits at most
822 //t += m;
823 t = t32 + (uint64_t)b + r[6];
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200824 //t_hi = (t < m);
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200825 // t_hi = 0;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200826 r[6] = (sp_digit)t;
827 //t = (t >> 32) | ((uint64_t)t_hi << 32);
828 t = (t >> 32);
829
830 //m = ((uint64_t)b * a[7]) + r[7];
831 // Since a[7] is ffffffff, the above optimizes to:
832 // m = b * ffffffff + r[7] = (b * 100000000 - b) + r[7]
833 m = ((uint64_t)b << 32) - b + r[7];
834 t += m;
835 //t_hi = (t < m);
Denys Vlasenko00f2cce2021-10-06 10:15:29 +0200836 // t_hi in fact is always 0 here (256bit * 32bit can't have more than 32 bits of overflow)
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200837 r[7] = (sp_digit)t;
838 //t = (t >> 32) | ((uint64_t)t_hi << 32);
839 t = (t >> 32);
840
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200841 r[8] += (sp_digit)t;
842 return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
Denys Vlasenkoe7305052021-10-05 13:30:48 +0200843}
844
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200845/* Reduce the number back to 256 bits using Montgomery reduction.
Denys Vlasenko9c671fe2021-11-27 18:42:27 +0100846 * Note: the result is NOT guaranteed to be less than p256_mod!
847 * (it is only guaranteed to fit into 256 bits).
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200848 *
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100849 * r Result.
850 * a Double-wide number to reduce. Clobbered.
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200851 * m The single precision number representing the modulus.
852 * mp The digit representing the negative inverse of m mod 2^n.
853 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100854static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200855{
Denys Vlasenkoc7842842021-10-06 01:09:37 +0200856// const sp_digit* m = p256_mod;
Denys Vlasenko389329e2021-10-05 13:39:33 +0200857 sp_digit mp = p256_mp_mod;
858
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200859 int i;
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200860// sp_digit mu;
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200861
862 if (mp != 1) {
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200863 sp_digit word16th = 0;
864 for (i = 0; i < 8; i++) {
865// mu = (sp_digit)(a[i] * mp);
866 if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
867 int j = i + 8;
868 inc_next_word0:
869 if (++j > 15) { /* a[16] array has no more words? */
870 word16th++;
871 continue;
872 }
873 if (++a[j] == 0) /* did this overflow too? */
874 goto inc_next_word0;
875 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200876 }
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100877 sp_512to256_mont_shift_8(r, a);
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200878 if (word16th != 0)
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100879 sp_256_sub_8_p256_mod(r);
880 sp_256_norm_8(r);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200881 }
Denys Vlasenko389329e2021-10-05 13:39:33 +0200882 else { /* Same code for explicit mp == 1 (which is always the case for P256) */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200883 sp_digit word16th = 0;
884 for (i = 0; i < 8; i++) {
Denys Vlasenko4415f7b2021-11-27 15:47:26 +0100885// mu = a[i];
Denys Vlasenko2430fcf2021-10-06 00:19:30 +0200886 if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200887 int j = i + 8;
888 inc_next_word:
889 if (++j > 15) { /* a[16] array has no more words? */
890 word16th++;
891 continue;
892 }
893 if (++a[j] == 0) /* did this overflow too? */
894 goto inc_next_word;
895 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200896 }
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100897 sp_512to256_mont_shift_8(r, a);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200898 if (word16th != 0)
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100899 sp_256_sub_8_p256_mod(r);
900 sp_256_norm_8(r);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200901 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200902}
903
904/* Multiply two Montogmery form numbers mod the modulus (prime).
905 * (r = a * b mod m)
906 *
907 * r Result of multiplication.
908 * a First number to multiply in Montogmery form.
909 * b Second number to multiply in Montogmery form.
910 * m Modulus (prime).
Denys Vlasenko1b93c7c2021-11-28 02:56:02 +0100911 * mp Montogmery multiplier.
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200912 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100913static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
Denys Vlasenko389329e2021-10-05 13:39:33 +0200914 /*, const sp_digit* m, sp_digit mp*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200915{
Denys Vlasenko389329e2021-10-05 13:39:33 +0200916 //const sp_digit* m = p256_mod;
917 //sp_digit mp = p256_mp_mod;
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100918 sp_digit t[2 * 8];
919 sp_256to512_mul_8(t, a, b);
920 sp_512to256_mont_reduce_8(r, t /*, m, mp*/);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200921}
922
923/* Square the Montgomery form number. (r = a * a mod m)
924 *
925 * r Result of squaring.
926 * a Number to square in Montogmery form.
927 * m Modulus (prime).
Denys Vlasenko1b93c7c2021-11-28 02:56:02 +0100928 * mp Montogmery multiplier.
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200929 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100930static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
Denys Vlasenko389329e2021-10-05 13:39:33 +0200931 /*, const sp_digit* m, sp_digit mp*/)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200932{
Denys Vlasenko389329e2021-10-05 13:39:33 +0200933 //const sp_digit* m = p256_mod;
934 //sp_digit mp = p256_mp_mod;
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100935 sp_256_mont_mul_8(r, a, a /*, m, mp*/);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200936}
937
938/* Invert the number, in Montgomery form, modulo the modulus (prime) of the
939 * P256 curve. (r = 1 / a mod m)
940 *
Denys Vlasenkocfb61572021-11-28 11:10:00 +0100941 * r Inverse result. Must not coincide with a.
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200942 * a Number to invert.
943 */
944#if 0
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +0100945//p256_mod - 2:
946//ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff - 2
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200947//Bit pattern:
948//2 2 2 2 2 2 2 1...1
949//5 5 4 3 2 1 0 9...0 9...1
950//543210987654321098765432109876543210987654321098765432109876543210...09876543210...09876543210
951//111111111111111111111111111111110000000000000000000000000000000100...00000111111...11111111101
952#endif
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200953static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200954{
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200955 int i;
956
Denys Vlasenkocfb61572021-11-28 11:10:00 +0100957 memcpy(r, a, sizeof(sp_digit) * 8);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200958 for (i = 254; i >= 0; i--) {
Denys Vlasenkocfb61572021-11-28 11:10:00 +0100959 sp_256_mont_sqr_8(r, r /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200960 /*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
961 if (i >= 224 || i == 192 || (i <= 95 && i != 1))
Denys Vlasenkocfb61572021-11-28 11:10:00 +0100962 sp_256_mont_mul_8(r, r, a /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200963 }
Denys Vlasenkoa2bc52d2021-04-27 01:21:26 +0200964}
965
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200966/* Multiply a number by Montogmery normalizer mod modulus (prime).
967 *
968 * r The resulting Montgomery form number.
969 * a The number to convert.
970 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200971static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200972{
Denys Vlasenko12040122021-04-26 20:24:34 +0200973 int64_t t[8];
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200974 int32_t o;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200975
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200976#define A(n) ((uint64_t)a[n])
Denys Vlasenko12040122021-04-26 20:24:34 +0200977 /* 1 1 0 -1 -1 -1 -1 0 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200978 t[0] = 0 + A(0) + A(1) - A(3) - A(4) - A(5) - A(6);
Denys Vlasenko12040122021-04-26 20:24:34 +0200979 /* 0 1 1 0 -1 -1 -1 -1 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200980 t[1] = 0 + A(1) + A(2) - A(4) - A(5) - A(6) - A(7);
Denys Vlasenko12040122021-04-26 20:24:34 +0200981 /* 0 0 1 1 0 -1 -1 -1 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200982 t[2] = 0 + A(2) + A(3) - A(5) - A(6) - A(7);
Denys Vlasenko12040122021-04-26 20:24:34 +0200983 /* -1 -1 0 2 2 1 0 -1 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200984 t[3] = 0 - A(0) - A(1) + 2 * A(3) + 2 * A(4) + A(5) - A(7);
Denys Vlasenko12040122021-04-26 20:24:34 +0200985 /* 0 -1 -1 0 2 2 1 0 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200986 t[4] = 0 - A(1) - A(2) + 2 * A(4) + 2 * A(5) + A(6);
Denys Vlasenko12040122021-04-26 20:24:34 +0200987 /* 0 0 -1 -1 0 2 2 1 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200988 t[5] = 0 - A(2) - A(3) + 2 * A(5) + 2 * A(6) + A(7);
Denys Vlasenko12040122021-04-26 20:24:34 +0200989 /* -1 -1 0 0 0 1 3 2 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200990 t[6] = 0 - A(0) - A(1) + A(5) + 3 * A(6) + 2 * A(7);
Denys Vlasenko12040122021-04-26 20:24:34 +0200991 /* 1 0 -1 -1 -1 -1 0 3 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +0200992 t[7] = 0 + A(0) - A(2) - A(3) - A(4) - A(5) + 3 * A(7);
993#undef A
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +0200994
Denys Vlasenko12040122021-04-26 20:24:34 +0200995 t[1] += t[0] >> 32; t[0] &= 0xffffffff;
996 t[2] += t[1] >> 32; t[1] &= 0xffffffff;
997 t[3] += t[2] >> 32; t[2] &= 0xffffffff;
998 t[4] += t[3] >> 32; t[3] &= 0xffffffff;
999 t[5] += t[4] >> 32; t[4] &= 0xffffffff;
1000 t[6] += t[5] >> 32; t[5] &= 0xffffffff;
1001 t[7] += t[6] >> 32; t[6] &= 0xffffffff;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001002 o = t[7] >> 32; //t[7] &= 0xffffffff;
Denys Vlasenko12040122021-04-26 20:24:34 +02001003 t[0] += o;
1004 t[3] -= o;
1005 t[6] -= o;
1006 t[7] += o;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001007 r[0] = (sp_digit)t[0];
1008 t[1] += t[0] >> 32;
1009 r[1] = (sp_digit)t[1];
1010 t[2] += t[1] >> 32;
1011 r[2] = (sp_digit)t[2];
1012 t[3] += t[2] >> 32;
1013 r[3] = (sp_digit)t[3];
1014 t[4] += t[3] >> 32;
1015 r[4] = (sp_digit)t[4];
1016 t[5] += t[4] >> 32;
1017 r[5] = (sp_digit)t[5];
1018 t[6] += t[5] >> 32;
1019 r[6] = (sp_digit)t[6];
1020// t[7] += t[6] >> 32;
1021// r[7] = (sp_digit)t[7];
1022 r[7] = (sp_digit)t[7] + (sp_digit)(t[6] >> 32);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001023}
1024
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001025/* Map the Montgomery form projective co-ordinate point to an affine point.
1026 *
1027 * r Resulting affine co-ordinate point.
1028 * p Montgomery form projective co-ordinate point.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001029 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001030static void sp_256_map_8(sp_point* r, sp_point* p)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001031{
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001032 sp_digit t1[8];
1033 sp_digit t2[8];
1034 sp_digit rr[2 * 8];
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001035
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001036 sp_256_mont_inv_8(t1, p->z);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001037
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001038 sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
1039 sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001040
Denys Vlasenko12040122021-04-26 20:24:34 +02001041 /* x /= z^2 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001042 sp_256_mont_mul_8(rr, p->x, t2 /*, p256_mod, p256_mp_mod*/);
1043 memset(rr + 8, 0, sizeof(rr) / 2);
1044 sp_512to256_mont_reduce_8(r->x, rr /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001045 /* Reduce x to less than modulus */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001046 if (sp_256_cmp_8(r->x, p256_mod) >= 0)
Denys Vlasenko5e9c6172021-10-06 20:14:49 +02001047 sp_256_sub_8_p256_mod(r->x);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001048 sp_256_norm_8(r->x);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001049
Denys Vlasenko12040122021-04-26 20:24:34 +02001050 /* y /= z^3 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001051 sp_256_mont_mul_8(rr, p->y, t1 /*, p256_mod, p256_mp_mod*/);
1052 memset(rr + 8, 0, sizeof(rr) / 2);
1053 sp_512to256_mont_reduce_8(r->y, rr /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001054 /* Reduce y to less than modulus */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001055 if (sp_256_cmp_8(r->y, p256_mod) >= 0)
Denys Vlasenko5e9c6172021-10-06 20:14:49 +02001056 sp_256_sub_8_p256_mod(r->y);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001057 sp_256_norm_8(r->y);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001058
Denys Vlasenko12040122021-04-26 20:24:34 +02001059 memset(r->z, 0, sizeof(r->z));
1060 r->z[0] = 1;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001061}
1062
1063/* Double the Montgomery form projective point p.
1064 *
1065 * r Result of doubling point.
1066 * p Point to double.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001067 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001068static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001069{
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001070 sp_digit t1[8];
1071 sp_digit t2[8];
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001072
Denys Vlasenko12040122021-04-26 20:24:34 +02001073 /* Put point to double into result */
1074 if (r != p)
1075 *r = *p; /* struct copy */
Denys Vlasenko4d3a5c12021-04-26 15:21:38 +02001076
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001077 if (r->infinity)
Denys Vlasenkoe7305052021-10-05 13:30:48 +02001078 return;
1079
Denys Vlasenko12040122021-04-26 20:24:34 +02001080 /* T1 = Z * Z */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001081 sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001082 /* Z = Y * Z */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001083 sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001084 /* Z = 2Z */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001085 sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001086 /* T2 = X - T1 */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001087 sp_256_mont_sub_8(t2, r->x, t1 /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001088 /* T1 = X + T1 */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001089 sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001090 /* T2 = T1 * T2 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001091 sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001092 /* T1 = 3T2 */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001093 sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001094 /* Y = 2Y */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001095 sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001096 /* Y = Y * Y */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001097 sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001098 /* T2 = Y * Y */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001099 sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001100 /* T2 = T2/2 */
Denys Vlasenkodcfd8d32021-11-27 16:07:42 +01001101 sp_256_div2_8(t2 /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001102 /* Y = Y * X */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001103 sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001104 /* X = T1 * T1 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001105 sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001106 /* X = X - Y */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001107 sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001108 /* X = X - Y */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001109 sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001110 /* Y = Y - X */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001111 sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001112 /* Y = Y * T1 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001113 sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenko12040122021-04-26 20:24:34 +02001114 /* Y = Y - T2 */
Denys Vlasenkoc7842842021-10-06 01:09:37 +02001115 sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001116 dump_512("y2 %s\n", r->y);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001117}
1118
1119/* Add two Montgomery form projective points.
1120 *
1121 * r Result of addition.
1122 * p Frist point to add.
1123 * q Second point to add.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001124 */
Denys Vlasenko53b2fdc2021-10-10 13:50:53 +02001125static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001126{
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001127 sp_digit t1[8];
1128 sp_digit t2[8];
1129 sp_digit t3[8];
1130 sp_digit t4[8];
1131 sp_digit t5[8];
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001132
Denys Vlasenko12040122021-04-26 20:24:34 +02001133 /* Ensure only the first point is the same as the result. */
1134 if (q == r) {
1135 sp_point* a = p;
1136 p = q;
1137 q = a;
1138 }
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001139
Denys Vlasenko12040122021-04-26 20:24:34 +02001140 /* Check double */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001141 sp_256_sub_8(t1, p256_mod, q->y);
1142 sp_256_norm_8(t1);
1143 if (sp_256_cmp_equal_8(p->x, q->x)
1144 && sp_256_cmp_equal_8(p->z, q->z)
1145 && (sp_256_cmp_equal_8(p->y, q->y) || sp_256_cmp_equal_8(p->y, t1))
Denys Vlasenko12040122021-04-26 20:24:34 +02001146 ) {
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001147 sp_256_proj_point_dbl_8(r, p);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001148 return;
Denys Vlasenko12040122021-04-26 20:24:34 +02001149 }
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001150
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001151 if (p->infinity || q->infinity) {
Denys Vlasenko12040122021-04-26 20:24:34 +02001152 *r = p->infinity ? *q : *p; /* struct copy */
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001153 return;
Denys Vlasenko12040122021-04-26 20:24:34 +02001154 }
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001155
1156 /* U1 = X1*Z2^2 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001157 sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
1158 sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
1159 sp_256_mont_mul_8(t1, t1, r->x /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001160 /* U2 = X2*Z1^2 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001161 sp_256_mont_sqr_8(t2, r->z /*, p256_mod, p256_mp_mod*/);
1162 sp_256_mont_mul_8(t4, t2, r->z /*, p256_mod, p256_mp_mod*/);
1163 sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001164 /* S1 = Y1*Z2^3 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001165 sp_256_mont_mul_8(t3, t3, r->y /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001166 /* S2 = Y2*Z1^3 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001167 sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001168 /* H = U2 - U1 */
1169 sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
1170 /* R = S2 - S1 */
1171 sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
1172 /* Z3 = H*Z1*Z2 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001173 sp_256_mont_mul_8(r->z, r->z, q->z /*, p256_mod, p256_mp_mod*/);
1174 sp_256_mont_mul_8(r->z, r->z, t2 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001175 /* X3 = R^2 - H^3 - 2*U1*H^2 */
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001176 sp_256_mont_sqr_8(r->x, t4 /*, p256_mod, p256_mp_mod*/);
1177 sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
1178 sp_256_mont_mul_8(r->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
1179 sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001180 sp_256_mont_sub_8(r->x, r->x, t5 /*, p256_mod*/);
1181 sp_256_mont_dbl_8(t1, r->y /*, p256_mod*/);
1182 sp_256_mont_sub_8(r->x, r->x, t1 /*, p256_mod*/);
1183 /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
1184 sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
Denys Vlasenkof92ae1d2021-11-27 19:15:43 +01001185 sp_256_mont_mul_8(r->y, r->y, t4 /*, p256_mod, p256_mp_mod*/);
1186 sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
Denys Vlasenkobbda85c2021-11-27 15:06:57 +01001187 sp_256_mont_sub_8(r->y, r->y, t5 /*, p256_mod*/);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001188}
1189
1190/* Multiply the point by the scalar and return the result.
1191 * If map is true then convert result to affine co-ordinates.
1192 *
1193 * r Resulting point.
1194 * g Point to multiply.
1195 * k Scalar to multiply by.
Denys Vlasenko03ab2a92021-04-26 14:55:46 +02001196 * map Indicates whether to convert result to affine.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001197 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001198static void sp_256_ecc_mulmod_8(sp_point* r, const sp_point* g, const sp_digit* k /*, int map*/)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001199{
Denys Vlasenko12040122021-04-26 20:24:34 +02001200 enum { map = 1 }; /* we always convert result to affine coordinates */
1201 sp_point t[3];
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001202 sp_digit n = n; /* for compiler */
Denys Vlasenko12040122021-04-26 20:24:34 +02001203 int c, y;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001204
Denys Vlasenko12040122021-04-26 20:24:34 +02001205 memset(t, 0, sizeof(t));
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001206
Denys Vlasenko12040122021-04-26 20:24:34 +02001207 /* t[0] = {0, 0, 1} * norm */
1208 t[0].infinity = 1;
1209 /* t[1] = {g->x, g->y, g->z} * norm */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001210 sp_256_mod_mul_norm_8(t[1].x, g->x);
1211 sp_256_mod_mul_norm_8(t[1].y, g->y);
1212 sp_256_mod_mul_norm_8(t[1].z, g->z);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001213
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001214 /* For every bit, starting from most significant... */
1215 k += 7;
1216 c = 256;
1217 for (;;) {
1218 if ((c & 0x1f) == 0) {
1219 if (c == 0)
Denys Vlasenko12040122021-04-26 20:24:34 +02001220 break;
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001221 n = *k--;
Denys Vlasenko12040122021-04-26 20:24:34 +02001222 }
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001223
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001224 y = (n >> 31);
1225 dbg("y:%d t[%d] = t[0]+t[1]\n", y, y^1);
1226 sp_256_proj_point_add_8(&t[y^1], &t[0], &t[1]);
1227 dump_512("t[0].x %s\n", t[0].x);
1228 dump_512("t[0].y %s\n", t[0].y);
1229 dump_512("t[0].z %s\n", t[0].z);
1230 dump_512("t[1].x %s\n", t[1].x);
1231 dump_512("t[1].y %s\n", t[1].y);
1232 dump_512("t[1].z %s\n", t[1].z);
1233 dbg("t[2] = t[%d]\n", y);
Denys Vlasenko26c85222021-11-27 15:00:14 +01001234 t[2] = t[y]; /* struct copy */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001235 dbg("t[2] *= 2\n");
1236 sp_256_proj_point_dbl_8(&t[2], &t[2]);
1237 dump_512("t[2].x %s\n", t[2].x);
1238 dump_512("t[2].y %s\n", t[2].y);
1239 dump_512("t[2].z %s\n", t[2].z);
Denys Vlasenko26c85222021-11-27 15:00:14 +01001240 t[y] = t[2]; /* struct copy */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001241
1242 n <<= 1;
1243 c--;
Denys Vlasenko12040122021-04-26 20:24:34 +02001244 }
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001245
Denys Vlasenko12040122021-04-26 20:24:34 +02001246 if (map)
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001247 sp_256_map_8(r, &t[0]);
Denys Vlasenko12040122021-04-26 20:24:34 +02001248 else
Denys Vlasenko9c671fe2021-11-27 18:42:27 +01001249 *r = t[0]; /* struct copy */
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001250
Denys Vlasenko12040122021-04-26 20:24:34 +02001251 memset(t, 0, sizeof(t)); //paranoia
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001252}
1253
1254/* Multiply the base point of P256 by the scalar and return the result.
1255 * If map is true then convert result to affine co-ordinates.
1256 *
1257 * r Resulting point.
1258 * k Scalar to multiply by.
Denys Vlasenko03ab2a92021-04-26 14:55:46 +02001259 * map Indicates whether to convert result to affine.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001260 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001261static void sp_256_ecc_mulmod_base_8(sp_point* r, sp_digit* k /*, int map*/)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001262{
Denys Vlasenko39a3ef52021-04-27 01:31:51 +02001263 /* Since this function is called only once, save space:
1264 * don't have "static const sp_point p256_base = {...}",
1265 * it would have more zeros than data.
1266 */
Denys Vlasenko48a18d12021-04-27 12:24:21 +02001267 static const uint8_t p256_base_bin[] = {
1268 /* x (big-endian) */
1269 0x6b,0x17,0xd1,0xf2,0xe1,0x2c,0x42,0x47,0xf8,0xbc,0xe6,0xe5,0x63,0xa4,0x40,0xf2,0x77,0x03,0x7d,0x81,0x2d,0xeb,0x33,0xa0,0xf4,0xa1,0x39,0x45,0xd8,0x98,0xc2,0x96,
1270 /* y */
1271 0x4f,0xe3,0x42,0xe2,0xfe,0x1a,0x7f,0x9b,0x8e,0xe7,0xeb,0x4a,0x7c,0x0f,0x9e,0x16,0x2b,0xce,0x33,0x57,0x6b,0x31,0x5e,0xce,0xcb,0xb6,0x40,0x68,0x37,0xbf,0x51,0xf5,
Denys Vlasenko646e8562021-04-27 13:09:44 +02001272 /* z will be set to 1, infinity flag to "false" */
Denys Vlasenko39a3ef52021-04-27 01:31:51 +02001273 };
1274 sp_point p256_base;
1275
Denys Vlasenko48a18d12021-04-27 12:24:21 +02001276 sp_256_point_from_bin2x32(&p256_base, p256_base_bin);
Denys Vlasenko39a3ef52021-04-27 01:31:51 +02001277
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001278 sp_256_ecc_mulmod_8(r, &p256_base, k /*, map*/);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001279}
1280
1281/* Multiply the point by the scalar and serialize the X ordinate.
1282 * The number is 0 padded to maximum size on output.
1283 *
1284 * priv Scalar to multiply the point by.
Denys Vlasenko074b33b2021-04-26 14:33:38 +02001285 * pub2x32 Point to multiply.
1286 * out32 Buffer to hold X ordinate.
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001287 */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001288static void sp_ecc_secret_gen_256(const sp_digit priv[8], const uint8_t *pub2x32, uint8_t* out32)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001289{
Denys Vlasenko12040122021-04-26 20:24:34 +02001290 sp_point point[1];
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001291
1292#if FIXED_PEER_PUBKEY
Denys Vlasenko12040122021-04-26 20:24:34 +02001293 memset((void*)pub2x32, 0x55, 64);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001294#endif
Denys Vlasenko12040122021-04-26 20:24:34 +02001295 dump_hex("peerkey %s\n", pub2x32, 32); /* in TLS, this is peer's public key */
1296 dump_hex(" %s\n", pub2x32 + 32, 32);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001297
Denys Vlasenko12040122021-04-26 20:24:34 +02001298 sp_256_point_from_bin2x32(point, pub2x32);
Denys Vlasenko81d8af12021-10-05 17:31:33 +02001299 dump_512("point->x %s\n", point->x);
1300 dump_512("point->y %s\n", point->y);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001301
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001302 sp_256_ecc_mulmod_8(point, point, priv);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001303
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001304 sp_256_to_bin_8(point->x, out32);
Denys Vlasenko12040122021-04-26 20:24:34 +02001305 dump_hex("out32: %s\n", out32, 32);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001306}
1307
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001308/* Generates a random scalar in [1..order-1] range. */
1309static void sp_256_ecc_gen_k_8(sp_digit k[8])
Denys Vlasenko074b33b2021-04-26 14:33:38 +02001310{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001311 /* Since 32-bit words are "dense", no need to use
1312 * sp_256_from_bin_8(k, buf) to convert random stream
1313 * to sp_digit array - just store random bits there directly.
1314 */
1315 tls_get_random(k, 8 * sizeof(k[0]));
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001316#if FIXED_SECRET
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001317 memset(k, 0x77, 8 * sizeof(k[0]));
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001318#endif
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001319
1320// If scalar is too large, try again (pseudo-code)
1321// if (k >= 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 - 1) // order of P256
1322// goto pick_another_random;
1323// k++; // ensure non-zero
1324 /* Simpler alternative, at the cost of not choosing some valid
1325 * random values, and slightly non-uniform distribution */
1326 if (k[0] == 0)
1327 k[0] = 1;
1328 if (k[7] >= 0xffffffff)
1329 k[7] = 0xfffffffe;
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001330}
1331
Denys Vlasenko074b33b2021-04-26 14:33:38 +02001332/* Makes a random EC key pair. */
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001333static void sp_ecc_make_key_256(sp_digit privkey[8], uint8_t *pubkey)
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001334{
1335 sp_point point[1];
1336
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001337 sp_256_ecc_gen_k_8(privkey);
Denys Vlasenko137864f2021-10-05 13:47:42 +02001338 dump_256("privkey %s\n", privkey);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001339 sp_256_ecc_mulmod_base_8(point, privkey);
Denys Vlasenko137864f2021-10-05 13:47:42 +02001340 dump_512("point->x %s\n", point->x);
1341 dump_512("point->y %s\n", point->y);
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001342 sp_256_to_bin_8(point->x, pubkey);
1343 sp_256_to_bin_8(point->y, pubkey + 32);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001344
1345 memset(point, 0, sizeof(point)); //paranoia
1346}
1347
1348void FAST_FUNC curve_P256_compute_pubkey_and_premaster(
Denys Vlasenko074b33b2021-04-26 14:33:38 +02001349 uint8_t *pubkey2x32, uint8_t *premaster32,
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001350 const uint8_t *peerkey2x32)
1351{
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001352 sp_digit privkey[8];
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001353
Denys Vlasenko3b411eb2021-10-05 20:00:50 +02001354 dump_hex("peerkey2x32: %s\n", peerkey2x32, 64);
Denys Vlasenko074b33b2021-04-26 14:33:38 +02001355 sp_ecc_make_key_256(privkey, pubkey2x32);
1356 dump_hex("pubkey: %s\n", pubkey2x32, 32);
1357 dump_hex(" %s\n", pubkey2x32 + 32, 32);
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001358
Denys Vlasenko074b33b2021-04-26 14:33:38 +02001359 /* Combine our privkey and peer's public key to generate premaster */
Denys Vlasenkof18a1fd2021-04-26 13:25:56 +02001360 sp_ecc_secret_gen_256(privkey, /*x,y:*/peerkey2x32, premaster32);
1361 dump_hex("premaster: %s\n", premaster32, 32);
1362}