/*
 * Copyright (C) 2017 Denys Vlasenko
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
#include "tls.h"

/* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
 * Changes are flagged with //bbox
 */

/**
 *	@file    pstm_montgomery_reduce.c
 *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
 *
 *	Multiprecision Montgomery Reduction.
 */
/*
 *	Copyright (c) 2013-2015 INSIDE Secure Corporation
 *	Copyright (c) PeerSec Networks, 2002-2011
 *	All Rights Reserved
 *
 *	The latest version of this code is available at http://www.matrixssl.org
 *
 *	This software is open source; you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *	the Free Software Foundation; either version 2 of the License, or
 *	(at your option) any later version.
 *
 *	This General Public License does NOT permit incorporating this software
 *	into proprietary programs.  If you are unable to comply with the GPL, a
 *	commercial license for this software may be purchased from INSIDE at
 *	http://www.insidesecure.com/eng/Company/Locations
 *
 *	This program is distributed in WITHOUT ANY WARRANTY; without even the
 *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *	See the GNU General Public License for more details.
 *
 *	You should have received a copy of the GNU General Public License
 *	along with this program; if not, write to the Free Software
 *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *	http://www.gnu.org/copyleft/gpl.html
 */
/******************************************************************************/

//bbox
//#include "../cryptoApi.h"
#ifndef DISABLE_PSTM

/******************************************************************************/

#if defined(PSTM_X86)
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
#endif
//#pragma message ("Using 32 bit x86 Assembly Optimizations")

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
	mu = c[x] * mp

#if 0
#define INNERMUL \
asm( \
	"movl %5,%%eax \n\t" \
	"mull %4       \n\t" \
	"addl %1,%%eax \n\t" \
	"adcl $0,%%edx \n\t" \
	"addl %%eax,%0 \n\t" \
	"adcl $0,%%edx \n\t" \
	"movl %%edx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
: "%eax", "%edx", "cc")
/*
 * The above generated "error: 'asm' operand has impossible constraints" on Android.
 * Do they reserve in their ABI a register for something, and there aren't enough left?
 */
#else
/* Let's avoid two explicit "movl" by telling compiler to put input value of *tmpm++
 * into EAX, and to expect cy result in EDX:
 */
#define INNERMUL \
asm( \
	"mull %4       \n\t" \
	"addl %3,%%eax \n\t" \
	"adcl $0,%%edx \n\t" \
	"addl %%eax,%0 \n\t" \
	"adcl $0,%%edx \n\t" \
:"=g"(_c[LO]), "=&d"(cy) \
:"0"(_c[LO]), "g"(cy), "g"(mu), "a"(*tmpm++) \
:"cc")
/* This doesn't tell compiler that we clobber EAX, but it probably won't need
 * the value of *tmpm anyway, thus won't try to reuse EAX contents.
 * TODO: fix it with dummy "=a"(clobbered_eax) output?
 */
#endif
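/* A minimal, untested sketch of the TODO above. The dummy "=a" output tells
 * the compiler that EAX is overwritten; all operand numbers shift by one, and
 * the *tmpm++ input is tied to the dummy output with "2". The name eax_junk
 * is ours and would have to be declared in the enclosing function:
 *
 *	pstm_digit eax_junk;
 *	#define INNERMUL \
 *	asm( \
 *		"mull %5       \n\t" \
 *		"addl %4,%%eax \n\t" \
 *		"adcl $0,%%edx \n\t" \
 *		"addl %%eax,%0 \n\t" \
 *		"adcl $0,%%edx \n\t" \
 *	:"=g"(_c[LO]), "=&d"(cy), "=a"(eax_junk) \
 *	:"0"(_c[LO]), "g"(cy), "g"(mu), "2"(*tmpm++) \
 *	:"cc")
 */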

#define PROPCARRY \
asm( \
	"addl %1,%0 \n\t" \
	"sbb  %1,%1 \n\t" \
	"neg  %1    \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
:"cc")
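/* "addl" leaves the carry in CF; "sbb %1,%1" computes %1 - %1 - CF, i.e.
 * 0 or -1, and "neg %1" maps that to 0 or 1. So cy ends up holding the
 * carry out of the addition, without a branch or an extra register.
 */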

/******************************************************************************/
#elif defined(PSTM_X86_64)
/* x86-64 optimized */
#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
#endif
//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
	mu = c[x] * mp

#define INNERMUL \
asm( \
	"movq %5,%%rax \n\t" \
	"mulq %4       \n\t" \
	"addq %1,%%rax \n\t" \
	"adcq $0,%%rdx \n\t" \
	"addq %%rax,%0 \n\t" \
	"adcq $0,%%rdx \n\t" \
	"movq %%rdx,%1 \n\t" \
	:"=g"(_c[LO]), "=r"(cy) \
	:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
	: "%rax", "%rdx", "cc")

#define INNERMUL8 \
asm( \
	"movq 0(%5),%%rax    \n\t" \
	"movq 0(%2),%%r10    \n\t" \
	"movq 0x8(%5),%%r11  \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x8(%2),%%r10  \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0(%0)    \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"movq 0x10(%5),%%r11 \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x10(%2),%%r10 \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x8(%0)  \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"movq 0x18(%5),%%r11 \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x18(%2),%%r10 \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x10(%0) \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"movq 0x20(%5),%%r11 \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x20(%2),%%r10 \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x18(%0) \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"movq 0x28(%5),%%r11 \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x28(%2),%%r10 \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x20(%0) \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"movq 0x30(%5),%%r11 \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x30(%2),%%r10 \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x28(%0) \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"movq 0x38(%5),%%r11 \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq 0x38(%2),%%r10 \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x30(%0) \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	"movq %%r11,%%rax    \n\t" \
	"mulq %4             \n\t" \
	"addq %%r10,%%rax    \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"addq %3,%%rax       \n\t" \
	"adcq $0,%%rdx       \n\t" \
	"movq %%rax,0x38(%0) \n\t" \
	"movq %%rdx,%1       \n\t" \
	\
	:"=r"(_c), "=r"(cy) \
	: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
	: "%rax", "%rdx", "%r10", "%r11", "cc")
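/* INNERMUL8 is INNERMUL unrolled for eight 64-bit limbs: while the current
 * mulq completes, the next tmpm[] limb is preloaded into R11 and the next
 * _c[] limb into R10, and the running carry stays in operand %1 between
 * limbs. The reduction loop below calls it for the first pa & ~7 digits,
 * eight per call.
 */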

#define PROPCARRY \
asm( \
	"addq   %1,%0  \n\t" \
	"setb   %%al   \n\t" \
	"movzbq %%al,%1 \n\t" \
	:"=g"(_c[LO]), "=r"(cy) \
	:"0"(_c[LO]), "1"(cy) \
	: "%rax", "cc")

/******************************************************************************/
#elif defined(PSTM_ARM)

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
	mu = c[x] * mp

#ifdef __thumb2__
//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
#define INNERMUL \
asm( \
	" LDR    r0,%1       \n\t" \
	" ADDS   r0,r0,%0    \n\t" \
	" ITE CS             \n\t" \
	" MOVCS  %0,#1       \n\t" \
	" MOVCC  %0,#0       \n\t" \
	" UMLAL  r0,%0,%3,%4 \n\t" \
	" STR    r0,%1       \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
	:"r0","cc");
#define PROPCARRY \
asm( \
	" LDR   r0,%1    \n\t" \
	" ADDS  r0,r0,%0 \n\t" \
	" STR   r0,%1    \n\t" \
	" ITE CS         \n\t" \
	" MOVCS %0,#1    \n\t" \
	" MOVCC %0,#0    \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"m"(_c[0])\
	:"r0","cc");
#else /* Non-Thumb2 code */
//#pragma message ("Using 32 bit ARM Assembly Optimizations")
#define INNERMUL \
asm( \
	" LDR    r0,%1       \n\t" \
	" ADDS   r0,r0,%0    \n\t" \
	" MOVCS  %0,#1       \n\t" \
	" MOVCC  %0,#0       \n\t" \
	" UMLAL  r0,%0,%3,%4 \n\t" \
	" STR    r0,%1       \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
	:"r0","cc");
#define PROPCARRY \
asm( \
	" LDR   r0,%1    \n\t" \
	" ADDS  r0,r0,%0 \n\t" \
	" STR   r0,%1    \n\t" \
	" MOVCS %0,#1    \n\t" \
	" MOVCC %0,#0    \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"0"(cy),"m"(_c[0])\
	:"r0","cc");
#endif /* __thumb2__ */
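/* In both variants ADDS sets the carry flag, and the MOVCS/MOVCC pair turns
 * it back into cy = 1 or 0. The Thumb2 code differs only in the "ITE CS"
 * prefix: Thumb2 requires conditional instructions to sit inside an IT block,
 * while in ARM mode nearly every instruction can be predicated directly.
 */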


/******************************************************************************/
#elif defined(PSTM_MIPS)
/* MIPS32 */
//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
	mu = c[x] * mp

#define INNERMUL \
asm( \
	" multu %3,%4       \n\t" \
	" mflo  $12         \n\t" \
	" mfhi  $13         \n\t" \
	" addu  $12,$12,%0  \n\t" \
	" sltu  $10,$12,%0  \n\t" \
	" addu  $13,$13,$10 \n\t" \
	" lw    $10,%1      \n\t" \
	" addu  $12,$12,$10 \n\t" \
	" sltu  $10,$12,$10 \n\t" \
	" addu  %0,$13,$10  \n\t" \
	" sw    $12,%1      \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
	:"$10","$12","$13")\
; ++tmpm;

#define PROPCARRY \
asm( \
	" lw    $10,%1      \n\t" \
	" addu  $10,$10,%0  \n\t" \
	" sw    $10,%1      \n\t" \
	" sltu  %0,$10,%0   \n\t" \
	:"=r"(cy),"=m"(_c[0])\
	:"r"(cy),"r"(_c[0])\
	:"$10");
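/* MIPS has no carry flag: addu does not record overflow. Instead, after each
 * addition, "sltu dst,sum,addend" yields 1 exactly when the 32-bit sum
 * wrapped around, which is how both macros reconstruct the carry.
 */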


/******************************************************************************/
#else

/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
	mu = c[x] * mp

#define INNERMUL \
	do { pstm_word t; \
		t = ((pstm_word)_c[0] + (pstm_word)cy) + \
			(((pstm_word)mu) * ((pstm_word)*tmpm++)); \
		_c[0] = (pstm_digit)t; \
		cy = (pstm_digit)(t >> DIGIT_BIT); \
	} while (0)

#define PROPCARRY \
	do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
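/* Example with DIGIT_BIT = 32: if _c[0] = 0xffffffff and cy = 2, then
 * _c[0] += cy wraps to t = 1; (t < cy) is true, so cy becomes 1, the carry
 * into the next digit. INNERMUL likewise keeps the full-width sum in a
 * pstm_word (twice DIGIT_BIT bits) and splits it into digit plus carry.
 */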

#endif

/******************************************************************************/

#define LO 0

/* computes x/R == x (mod N) via Montgomery Reduction */
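/* (Notation, as used throughout the pstm code: b = 2^DIGIT_BIT is the digit
 * base, pa = m->used, R = b^pa, and mp = -1/m mod b is precomputed by the
 * caller. Each round adds mu*m with mu = c[x]*mp mod b, which zeroes digit x
 * of c; after pa rounds the low pa digits are zero, so the "copy out" loop
 * below shifts right by pa digits, i.e. divides by R.)
 */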
int32 FAST_FUNC pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
		pstm_digit mp, pstm_digit *paD, uint32 paDlen)
{
	pstm_digit *c, *_c, *tmpm, mu;
	int32 oldused, x, y;
	int pa; //bbox: was int16

	pa = m->used;
	if (pa > a->alloc) {
		/* Sanity test for bad numbers. This will confirm no buffer overruns */
		return PS_LIMIT_FAIL;
	}

	if (paD && paDlen >= (uint32)2*pa+1) {
		c = paD;
		memset(c, 0x0, paDlen);
	} else {
		c = xzalloc(2*pa+1);//bbox
	}
	/* copy the input */
	oldused = a->used;
	for (x = 0; x < oldused; x++) {
		c[x] = a->dp[x];
	}

	MONT_START;

	for (x = 0; x < pa; x++) {
		pstm_digit cy = 0;
		/* get Mu for this round */
		LOOP_START;
		_c = c + x;
		tmpm = m->dp;
		y = 0;
#ifdef PSTM_X86_64
		for (; y < (pa & ~7); y += 8) {
			INNERMUL8;
			_c += 8;
			tmpm += 8;
		}
#endif /* PSTM_X86_64 */
		for (; y < pa; y++) {
			INNERMUL;
			++_c;
		}
		LOOP_END;
		while (cy) {
			PROPCARRY;
			++_c;
		}
	}

	/* now copy out */
	_c = c + pa;
	tmpm = a->dp;
	for (x = 0; x < pa+1; x++) {
		*tmpm++ = *_c++;
	}

	for (; x < oldused; x++) {
		*tmpm++ = 0;
	}

	MONT_FINI;

	a->used = pa+1;
	pstm_clamp(a);

	/* reuse x as return code */
	x = PSTM_OKAY;

	/* if A >= m then A = A - m */
	if (pstm_cmp_mag (a, m) != PSTM_LT) {
		if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
			x = PS_MEM_FAIL;
		}
	}
	if (paDlen < (uint32)2*pa+1) {
		psFree(c, pool);
	}
	return x;
}
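
/* Usage sketch (hedged: pstm_montgomery_setup() and the call sequence are
 * assumed from the matrixssl pstm API, not defined in this file):
 *
 *	pstm_digit mp;
 *	pstm_montgomery_setup(m, &mp);        // mp = -1/m mod b
 *	// ... compute a (e.g. a Comba product), with m->used <= a->alloc ...
 *	pstm_montgomery_reduce(pool, a, m, mp, paD, paDlen);  // a = a/R mod m
 */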

#endif /* !DISABLE_PSTM */
/******************************************************************************/