blob: 8604132d616d65de593bbf357a6f56875ee1fb7f [file] [log] [blame]
Denys Vlasenko11d00962017-01-15 00:12:42 +01001/*
2 * Copyright (C) 2017 Denys Vlasenko
3 *
4 * Licensed under GPLv2, see file LICENSE in this source tree.
5 */
6#include "tls.h"
7
Denys Vlasenko3f8ecd92017-01-15 14:16:51 +01008/* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
Denys Vlasenko6b1b0042017-01-19 15:51:00 +01009 * Changes are flagged with //bbox
Denys Vlasenko3f8ecd92017-01-15 14:16:51 +010010 */
11
Denys Vlasenko11d00962017-01-15 00:12:42 +010012/**
13 * @file pstm_sqr_comba.c
14 * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
15 *
16 * Multiprecision Squaring with Comba technique.
17 */
18/*
19 * Copyright (c) 2013-2015 INSIDE Secure Corporation
20 * Copyright (c) PeerSec Networks, 2002-2011
21 * All Rights Reserved
22 *
23 * The latest version of this code is available at http://www.matrixssl.org
24 *
25 * This software is open source; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License as published by
27 * the Free Software Foundation; either version 2 of the License, or
28 * (at your option) any later version.
29 *
30 * This General Public License does NOT permit incorporating this software
31 * into proprietary programs. If you are unable to comply with the GPL, a
32 * commercial license for this software may be purchased from INSIDE at
33 * http://www.insidesecure.com/eng/Company/Locations
34 *
35 * This program is distributed in WITHOUT ANY WARRANTY; without even the
36 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37 * See the GNU General Public License for more details.
38 *
39 * You should have received a copy of the GNU General Public License
40 * along with this program; if not, write to the Free Software
41 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
42 * http://www.gnu.org/copyleft/gpl.html
43 */
44/******************************************************************************/
45
Denys Vlasenko6b1b0042017-01-19 15:51:00 +010046//bbox
Denys Vlasenko11d00962017-01-15 00:12:42 +010047//#include "../cryptoApi.h"
48#ifndef DISABLE_PSTM
49
50/******************************************************************************/
51#if defined(PSTM_X86)
52/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
53#if !defined(__GNUC__) || !defined(__i386__)
54#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
55#endif
56//#pragma message ("Using 32 bit x86 Assembly Optimizations")
57
/*
 * 32-bit x86 column-accumulator primitives for the comba squarers.
 * c2:c1:c0 is the three-digit running sum of the current output column;
 * sc2:sc1:sc0 is a second accumulator that SQRADDDB adds in twice.
 * COMBA_START and COMBA_FINI expand to nothing in this variant.
 * Every statement-like macro already ends in ';'.
 */
58#define COMBA_START
59
/* Reset the accumulator c2:c1:c0 to zero. */
60#define CLEAR_CARRY \
61 c0 = c1 = c2 = 0;
62
/* Store the lowest accumulator digit (c0) into x. */
63#define COMBA_STORE(x) \
64 x = c0;
65
/* Store the middle accumulator digit (c1) into x. */
66#define COMBA_STORE2(x) \
67 x = c1;
68
/* Advance to the next column: shift the accumulator down one digit, clear c2. */
69#define CARRY_FORWARD \
70 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
71
72#define COMBA_FINI
73
/*
 * c2:c1:c0 += i * i. Only operand i (%6) is referenced: it is loaded into
 * %eax and multiplied by itself; j is ignored.
 */
74#define SQRADD(i, j) \
75asm( \
76 "movl %6,%%eax \n\t" \
77 "mull %%eax \n\t" \
78 "addl %%eax,%0 \n\t" \
79 "adcl %%edx,%1 \n\t" \
80 "adcl $0,%2 \n\t" \
Denys Vlasenko79376ec2017-07-15 17:13:08 +020081 :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
82 //bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build
Denys Vlasenko11d00962017-01-15 00:12:42 +010083
/*
 * c2:c1:c0 += 2 * i * j. The product is formed once in %edx:%eax and the
 * add-with-carry sequence is issued twice.
 */
84#define SQRADD2(i, j) \
85asm( \
86 "movl %6,%%eax \n\t" \
87 "mull %7 \n\t" \
88 "addl %%eax,%0 \n\t" \
89 "adcl %%edx,%1 \n\t" \
90 "adcl $0,%2 \n\t" \
91 "addl %%eax,%0 \n\t" \
92 "adcl %%edx,%1 \n\t" \
93 "adcl $0,%2 \n\t" \
Denys Vlasenko79376ec2017-07-15 17:13:08 +020094 :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
95 //bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build
Denys Vlasenko11d00962017-01-15 00:12:42 +010096
/*
 * Start the secondary accumulator: sc1:sc0 = i * j, sc2 = 0.
 * NOTE(review): this macro, SQRADDAC and SQRADDDB still use "=r" outputs and
 * "g" inputs, unlike SQRADD/SQRADD2 above - confirm that is intended.
 */
97#define SQRADDSC(i, j) \
98asm( \
99 "movl %6,%%eax \n\t" \
100 "mull %7 \n\t" \
101 "movl %%eax,%0 \n\t" \
102 "movl %%edx,%1 \n\t" \
103 "xorl %2,%2 \n\t" \
104 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
105
/* sc2:sc1:sc0 += i * j. */
106#define SQRADDAC(i, j) \
107asm( \
108 "movl %6,%%eax \n\t" \
109 "mull %7 \n\t" \
110 "addl %%eax,%0 \n\t" \
111 "adcl %%edx,%1 \n\t" \
112 "adcl $0,%2 \n\t" \
113 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
114
/* c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary accumulator is added twice). */
115#define SQRADDDB \
116asm( \
117 "addl %6,%0 \n\t" \
118 "adcl %7,%1 \n\t" \
119 "adcl %8,%2 \n\t" \
120 "addl %6,%0 \n\t" \
121 "adcl %7,%1 \n\t" \
122 "adcl %8,%2 \n\t" \
123 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
124
125/******************************************************************************/
126#elif defined(PSTM_X86_64)
127/* x86-64 optimized */
128#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
129#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
130#endif
131//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
132
/*
 * x86-64 column-accumulator primitives for the comba squarers.
 * c2:c1:c0 is the three-digit running sum of the current output column;
 * sc2:sc1:sc0 is a second accumulator that SQRADDDB adds in twice.
 * COMBA_START and COMBA_FINI expand to nothing in this variant.
 * Every statement-like macro already ends in ';'.
 */
133#define COMBA_START
134
/* Reset the accumulator c2:c1:c0 to zero. */
135#define CLEAR_CARRY \
136c0 = c1 = c2 = 0;
137
/* Store the lowest accumulator digit (c0) into x. */
138#define COMBA_STORE(x) \
139x = c0;
140
/* Store the middle accumulator digit (c1) into x. */
141#define COMBA_STORE2(x) \
142x = c1;
143
/* Advance to the next column: shift the accumulator down one digit, clear c2. */
144#define CARRY_FORWARD \
145do { c0 = c1; c1 = c2; c2 = 0; } while (0);
146
147#define COMBA_FINI
148
/*
 * c2:c1:c0 += i * i. Only operand i (%6) is referenced: it is loaded into
 * %rax and multiplied by itself; j is ignored.
 */
149#define SQRADD(i, j) \
150asm( \
151 "movq %6,%%rax \n\t" \
152 "mulq %%rax \n\t" \
153 "addq %%rax,%0 \n\t" \
154 "adcq %%rdx,%1 \n\t" \
155 "adcq $0,%2 \n\t" \
156 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
157
/*
 * c2:c1:c0 += 2 * i * j. The product is formed once in %rdx:%rax and the
 * add-with-carry sequence is issued twice.
 */
158#define SQRADD2(i, j) \
159asm( \
160 "movq %6,%%rax \n\t" \
161 "mulq %7 \n\t" \
162 "addq %%rax,%0 \n\t" \
163 "adcq %%rdx,%1 \n\t" \
164 "adcq $0,%2 \n\t" \
165 "addq %%rax,%0 \n\t" \
166 "adcq %%rdx,%1 \n\t" \
167 "adcq $0,%2 \n\t" \
168 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
169
/* Start the secondary accumulator: sc1:sc0 = i * j, sc2 = 0. */
170#define SQRADDSC(i, j) \
171asm( \
172 "movq %6,%%rax \n\t" \
173 "mulq %7 \n\t" \
174 "movq %%rax,%0 \n\t" \
175 "movq %%rdx,%1 \n\t" \
176 "xorq %2,%2 \n\t" \
177 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
178
/* sc2:sc1:sc0 += i * j. */
179#define SQRADDAC(i, j) \
180asm( \
181 "movq %6,%%rax \n\t" \
182 "mulq %7 \n\t" \
183 "addq %%rax,%0 \n\t" \
184 "adcq %%rdx,%1 \n\t" \
185 "adcq $0,%2 \n\t" \
186 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
187
/* c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary accumulator is added twice). */
188#define SQRADDDB \
189asm( \
190 "addq %6,%0 \n\t" \
191 "adcq %7,%1 \n\t" \
192 "adcq %8,%2 \n\t" \
193 "addq %6,%0 \n\t" \
194 "adcq %7,%1 \n\t" \
195 "adcq %8,%2 \n\t" \
196 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
197
198/******************************************************************************/
199#elif defined(PSTM_ARM)
200/* ARM code */
201//#pragma message ("Using 32 bit ARM Assembly Optimizations")
202
/*
 * ARM column-accumulator primitives for the comba squarers.
 * c2:c1:c0 is the three-digit running sum of the current output column;
 * sc2:sc1:sc0 is a second accumulator that SQRADDDB adds in twice.
 * r0/r1 are used as scratch for the UMULL result and are listed as clobbers.
 * COMBA_START and COMBA_FINI expand to nothing in this variant.
 */
203#define COMBA_START
204
/* Reset the accumulator c2:c1:c0 to zero. */
205#define CLEAR_CARRY \
206c0 = c1 = c2 = 0;
207
/* Store the lowest accumulator digit (c0) into x. */
208#define COMBA_STORE(x) \
209x = c0;
210
/* Store the middle accumulator digit (c1) into x. */
211#define COMBA_STORE2(x) \
212x = c1;
213
/* Advance to the next column: shift the accumulator down one digit, clear c2. */
214#define CARRY_FORWARD \
215do { c0 = c1; c1 = c2; c2 = 0; } while (0);
216
217#define COMBA_FINI
218
219/* c2:c1:c0 += i * i; only operand i (%6) is referenced, j is ignored */
220#define SQRADD(i, j) \
221asm( \
222" UMULL r0,r1,%6,%6 \n\t" \
223" ADDS %0,%0,r0 \n\t" \
224" ADCS %1,%1,r1 \n\t" \
225" ADC %2,%2,#0 \n\t" \
226:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
227
228/* for squaring some of the terms are doubled: c2:c1:c0 += 2 * i * j (one UMULL, added twice) */
229#define SQRADD2(i, j) \
230asm( \
231" UMULL r0,r1,%6,%7 \n\t" \
232" ADDS %0,%0,r0 \n\t" \
233" ADCS %1,%1,r1 \n\t" \
234" ADC %2,%2,#0 \n\t" \
235" ADDS %0,%0,r0 \n\t" \
236" ADCS %1,%1,r1 \n\t" \
237" ADC %2,%2,#0 \n\t" \
238:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
239
/* Start the secondary accumulator: sc1:sc0 = i * j, sc2 = 0 (SUB of a register from itself). */
240#define SQRADDSC(i, j) \
241asm( \
242" UMULL %0,%1,%6,%7 \n\t" \
243" SUB %2,%2,%2 \n\t" \
244:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
245
/* sc2:sc1:sc0 += i * j. */
246#define SQRADDAC(i, j) \
247asm( \
248" UMULL r0,r1,%6,%7 \n\t" \
249" ADDS %0,%0,r0 \n\t" \
250" ADCS %1,%1,r1 \n\t" \
251" ADC %2,%2,#0 \n\t" \
252:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
253
/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary accumulator is added twice).
 * Operand order differs from the x86 variants: sc0..sc2 are %3..%5 here
 * because they are listed before the tied c0..c2 inputs.
 */
254#define SQRADDDB \
255asm( \
256" ADDS %0,%0,%3 \n\t" \
257" ADCS %1,%1,%4 \n\t" \
258" ADC %2,%2,%5 \n\t" \
259" ADDS %0,%0,%3 \n\t" \
260" ADCS %1,%1,%4 \n\t" \
261" ADC %2,%2,%5 \n\t" \
262:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
263
264/******************************************************************************/
265#elif defined(PSTM_MIPS)
266/* MIPS32 */
267//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
268
/*
 * MIPS32 column-accumulator primitives for the comba squarers.
 * c2:c1:c0 is the three-digit running sum of the current output column;
 * sc2:sc1:sc0 is a second accumulator that SQRADDDB adds in twice.
 * The asm bodies derive each carry with sltu (result < addend) after an addu.
 * COMBA_START and COMBA_FINI expand to nothing in this variant.
 * NOTE(review): multu writes HI/LO (read back with mflo/mfhi), but no clobber
 * list below names them - confirm the toolchain treats them as clobbered.
 */
269#define COMBA_START
270
/* Reset the accumulator c2:c1:c0 to zero. */
271#define CLEAR_CARRY \
272c0 = c1 = c2 = 0;
273
/* Store the lowest accumulator digit (c0) into x. */
274#define COMBA_STORE(x) \
275x = c0;
276
/* Store the middle accumulator digit (c1) into x. */
277#define COMBA_STORE2(x) \
278x = c1;
279
/* Advance to the next column: shift the accumulator down one digit, clear c2. */
280#define CARRY_FORWARD \
281do { c0 = c1; c1 = c2; c2 = 0; } while (0);
282
283#define COMBA_FINI
284
285/* c2:c1:c0 += i * i; only operand i (%6) is referenced, j is ignored. $12/$13 are scratch */
286#define SQRADD(i, j) \
287asm( \
288 " multu %6,%6 \n\t" \
289 " mflo $12 \n\t" \
290 " mfhi $13 \n\t" \
291 " addu %0,%0,$12 \n\t" \
292 " sltu $12,%0,$12 \n\t" \
293 " addu %1,%1,$13 \n\t" \
294 " sltu $13,%1,$13 \n\t" \
295 " addu %1,%1,$12 \n\t" \
296 " sltu $12,%1,$12 \n\t" \
297 " addu %2,%2,$13 \n\t" \
298 " addu %2,%2,$12 \n\t" \
299 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
300
301/* for squaring some of the terms are doubled: c2:c1:c0 += 2 * i * j. The product stays in $12/$13 and is added twice; $14/$15 hold the carries */
302#define SQRADD2(i, j) \
303asm( \
304 " multu %6,%7 \n\t" \
305 " mflo $12 \n\t" \
306 " mfhi $13 \n\t" \
307 \
308 " addu %0,%0,$12 \n\t" \
309 " sltu $14,%0,$12 \n\t" \
310 " addu %1,%1,$13 \n\t" \
311 " sltu $15,%1,$13 \n\t" \
312 " addu %1,%1,$14 \n\t" \
313 " sltu $14,%1,$14 \n\t" \
314 " addu %2,%2,$15 \n\t" \
315 " addu %2,%2,$14 \n\t" \
316 \
317 " addu %0,%0,$12 \n\t" \
318 " sltu $14,%0,$12 \n\t" \
319 " addu %1,%1,$13 \n\t" \
320 " sltu $15,%1,$13 \n\t" \
321 " addu %1,%1,$14 \n\t" \
322 " sltu $14,%1,$14 \n\t" \
323 " addu %2,%2,$15 \n\t" \
324 " addu %2,%2,$14 \n\t" \
325 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
326
/* Start the secondary accumulator: sc1:sc0 = i * j, sc2 = 0 (xor of a register with itself). */
327#define SQRADDSC(i, j) \
328asm( \
329 " multu %6,%7 \n\t" \
330 " mflo %0 \n\t" \
331 " mfhi %1 \n\t" \
332 " xor %2,%2,%2 \n\t" \
333 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
334
/*
 * sc2:sc1:sc0 += i * j.
 * NOTE(review): "$14" is listed as clobbered but the asm body only touches
 * $12 and $13 - harmless, but confirm it is not a leftover.
 */
335#define SQRADDAC(i, j) \
336asm( \
337 " multu %6,%7 \n\t" \
338 " mflo $12 \n\t" \
339 " mfhi $13 \n\t" \
340 " addu %0,%0,$12 \n\t" \
341 " sltu $12,%0,$12 \n\t" \
342 " addu %1,%1,$13 \n\t" \
343 " sltu $13,%1,$13 \n\t" \
344 " addu %1,%1,$12 \n\t" \
345 " sltu $12,%1,$12 \n\t" \
346 " addu %2,%2,$13 \n\t" \
347 " addu %2,%2,$12 \n\t" \
348 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
349
/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0 (the same add sequence is issued twice).
 * sc0..sc2 are operands %3..%5; $10/$11 are scratch for the carries.
 */
350#define SQRADDDB \
351asm( \
352 " addu %0,%0,%3 \n\t" \
353 " sltu $10,%0,%3 \n\t" \
354 " addu %1,%1,$10 \n\t" \
355 " sltu $10,%1,$10 \n\t" \
356 " addu %1,%1,%4 \n\t" \
357 " sltu $11,%1,%4 \n\t" \
358 " addu %2,%2,$10 \n\t" \
359 " addu %2,%2,$11 \n\t" \
360 " addu %2,%2,%5 \n\t" \
361 \
362 " addu %0,%0,%3 \n\t" \
363 " sltu $10,%0,%3 \n\t" \
364 " addu %1,%1,$10 \n\t" \
365 " sltu $10,%1,$10 \n\t" \
366 " addu %1,%1,%4 \n\t" \
367 " sltu $11,%1,%4 \n\t" \
368 " addu %2,%2,$10 \n\t" \
369 " addu %2,%2,$11 \n\t" \
370 " addu %2,%2,%5 \n\t" \
371 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
372
373#else
374/******************************************************************************/
#define PSTM_ISO
/* ISO C portable code */

/*
	Column-accumulator primitives for the comba squarers, in portable C.

	c2:c1:c0 is the three-digit running sum of the current output column and
	sc2:sc1:sc0 is a second accumulator that SQRADDDB adds in twice. The
	macros expect those pstm_digit variables to be in scope at the point of
	use; SQRADD2 additionally needs a pstm_word named tt.

	Every macro parameter is parenthesized so that an expression argument is
	evaluated as a unit before it is widened to pstm_word. Each argument is
	still evaluated exactly once, so side effects such as *p++ are safe.

	The statement-like macros keep their trailing ';', like the assembly
	variants of this macro set, so existing call sites expand unchanged.
 */
#define COMBA_START

/* Reset the accumulator c2:c1:c0 to zero. */
#define CLEAR_CARRY \
	c0 = c1 = c2 = 0;

/* Store the lowest accumulator digit (c0) into x. */
#define COMBA_STORE(x) \
	(x) = c0;

/* Store the middle accumulator digit (c1) into x. */
#define COMBA_STORE2(x) \
	(x) = c1;

/* Advance to the next column: shift the accumulator down one digit, clear c2. */
#define CARRY_FORWARD \
	do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* c2:c1:c0 += i * j (callers pass the same digit twice to add a square) */
#define SQRADD(i, j) \
	do { pstm_word t; \
	t = (pstm_word)c0 + ((pstm_word)(i)) * ((pstm_word)(j)); c0 = (pstm_digit)t; \
	t = (pstm_word)c1 + (t >> DIGIT_BIT); \
	c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \
	} while (0);


/* for squaring some of the terms are doubled: c2:c1:c0 += 2 * i * j.
   The product is computed once and added twice; tt carries the partial sums. */
#define SQRADD2(i, j) \
	do { pstm_word t; \
	t = ((pstm_word)(i)) * ((pstm_word)(j)); \
	tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
	tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
	c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
	tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
	tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
	c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
	} while (0);

/* Start the secondary accumulator: sc1:sc0 = i * j, sc2 = 0. */
#define SQRADDSC(i, j) \
	do { pstm_word t; \
	t = ((pstm_word)(i)) * ((pstm_word)(j)); \
	sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0; \
	} while (0);

/* sc2:sc1:sc0 += i * j. */
#define SQRADDAC(i, j) \
	do { pstm_word t; \
	t = ((pstm_word)sc0) + ((pstm_word)(i)) * ((pstm_word)(j)); \
	sc0 = (pstm_digit)t; \
	t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t; \
	sc2 += (pstm_digit)(t >> DIGIT_BIT); \
	} while (0);

/* c2:c1:c0 += 2 * sc2:sc1:sc0. */
#define SQRADDDB \
	do { pstm_word t; \
	t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0); \
	c0 = (pstm_digit)t; \
	t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT); \
	c1 = (pstm_digit)t; \
	c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT); \
	} while (0);
437
438#endif /* ISO_C */
439
440/******************************************************************************/
441/*
442 Non-unrolled comba squarer
443 */
Denys Vlasenko6b1b0042017-01-19 15:51:00 +0100444//bbox: pool unused
Denys Vlasenko11d00962017-01-15 00:12:42 +0100445#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
446 pstm_sqr_comba_gen( A, B, paD, paDlen)
447static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
448 pstm_digit *paD, uint32 paDlen)
449{
Denys Vlasenko3d7ec482017-07-15 17:19:38 +0200450 int paDfail, pa; //bbox: was int16
Denys Vlasenko11d00962017-01-15 00:12:42 +0100451 int32 ix, iz;
452 pstm_digit c0, c1, c2, *dst;
453#ifdef PSTM_ISO
454 pstm_word tt;
455#endif
456
457 paDfail = 0;
458 /* get size of output and trim */
459 pa = A->used + A->used;
460
461 /* number of output digits to produce */
462 COMBA_START;
463 CLEAR_CARRY;
464/*
465 If b is not large enough grow it and continue
466*/
467 if (B->alloc < pa) {
468 if (pstm_grow(B, pa) != PSTM_OKAY) {
469 return PS_MEM_FAIL;
470 }
471 }
472 if (paD != NULL) {
473 if (paDlen < (sizeof(pstm_digit) * pa)) {
474 paDfail = 1; /* have a paD, but it's not big enough */
Denys Vlasenko6b1b0042017-01-19 15:51:00 +0100475 dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
Denys Vlasenko11d00962017-01-15 00:12:42 +0100476 } else {
477 dst = paD;
478 memset(dst, 0x0, paDlen);
479 }
480 } else {
Denys Vlasenko6b1b0042017-01-19 15:51:00 +0100481 dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
Denys Vlasenko11d00962017-01-15 00:12:42 +0100482 }
483
484 for (ix = 0; ix < pa; ix++) {
485 int32 tx, ty, iy;
486 pstm_digit *tmpy, *tmpx;
487
488 /* get offsets into the two bignums */
489 ty = min(A->used-1, ix);
490 tx = ix - ty;
491
492 /* setup temp aliases */
493 tmpx = A->dp + tx;
494 tmpy = A->dp + ty;
495
496/*
497 This is the number of times the loop will iterate,
498 while (tx++ < a->used && ty-- >= 0) { ... }
499*/
500 iy = min(A->used-tx, ty+1);
501
502/*
503 now for squaring tx can never equal ty. We halve the distance since
504 they approach at a rate of 2x and we have to round because odd cases
505 need to be executed
506*/
507 iy = min(iy, (ty-tx+1)>>1);
508
509 /* forward carries */
510 CARRY_FORWARD;
511
512 /* execute loop */
513 for (iz = 0; iz < iy; iz++) {
514 SQRADD2(*tmpx++, *tmpy--);
515 }
516
517 /* even columns have the square term in them */
518 if ((ix&1) == 0) {
519 SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
520 }
521
522 /* store it */
523 COMBA_STORE(dst[ix]);
524 }
525
526 COMBA_FINI;
527/*
528 setup dest
529 */
530 iz = B->used;
531 B->used = pa;
532 {
533 pstm_digit *tmpc;
534 tmpc = B->dp;
535 for (ix = 0; ix < pa; ix++) {
536 *tmpc++ = dst[ix];
537 }
538 /* clear unused digits (that existed in the old copy of c) */
539 for (; ix < iz; ix++) {
540 *tmpc++ = 0;
541 }
542 }
543 pstm_clamp(B);
544
545 if ((paD == NULL) || paDfail == 1) {
546 psFree(dst, pool);
547 }
548 return PS_SUCCESS;
549}
550
551/******************************************************************************/
552/*
553 Unrolled Comba loop for 1024 bit keys
554 */
555#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
/*
 * Fully unrolled comba squarer for a 16-digit operand: B = A * A.
 *
 * Reads a[0]..a[15] from A->dp and produces 32 result digits. A->used is
 * not checked here, so the caller presumably only dispatches to this routine
 * for operands of exactly 16 digits - confirm against the call site.
 *
 * The product is built in the local array b[] and copied into B->dp only at
 * the end; A->dp is fetched after the pstm_grow() call on B.
 *
 * Column scheme: each output digit k sums the products a[i]*a[j] with
 * i + j == k. Columns with one or two cross terms add them with SQRADD2;
 * columns 5..25 gather the cross terms once in sc2:sc1:sc0
 * (SQRADDSC/SQRADDAC) and double them in one step with SQRADDDB. Even
 * columns additionally add the square term a[k/2]^2 with SQRADD.
 *
 * Returns PS_MEM_FAIL if B cannot be grown to 32 digits, PSTM_OKAY otherwise.
 * B->sign is set to PSTM_ZPOS and B is clamped.
 */
556static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
557{
558 pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
559#ifdef PSTM_ISO
560 pstm_word tt;
561#endif
562
563 if (B->alloc < 32) {
564 if (pstm_grow(B, 32) != PSTM_OKAY) {
565 return PS_MEM_FAIL;
566 }
567 }
568 a = A->dp;
/* SQRADDSC overwrites all three before any use; the asm variants also list them as tied inputs */
569 sc0 = sc1 = sc2 = 0;
570
571 COMBA_START;
572
573 /* clear carries */
574 CLEAR_CARRY;
575
576 /* output 0 */
577 SQRADD(a[0],a[0]);
578 COMBA_STORE(b[0]);
579
580 /* output 1 */
581 CARRY_FORWARD;
582 SQRADD2(a[0], a[1]);
583 COMBA_STORE(b[1]);
584
585 /* output 2 */
586 CARRY_FORWARD;
587 SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
588 COMBA_STORE(b[2]);
589
590 /* output 3 */
591 CARRY_FORWARD;
592 SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
593 COMBA_STORE(b[3]);
594
595 /* output 4 */
596 CARRY_FORWARD;
597 SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
598 COMBA_STORE(b[4]);
599
600 /* output 5 */
601 CARRY_FORWARD;
602 SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
603 COMBA_STORE(b[5]);
604
605 /* output 6 */
606 CARRY_FORWARD;
607 SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
608 COMBA_STORE(b[6]);
609
610 /* output 7 */
611 CARRY_FORWARD;
612 SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
613 COMBA_STORE(b[7]);
614
615 /* output 8 */
616 CARRY_FORWARD;
617 SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
618 COMBA_STORE(b[8]);
619
620 /* output 9 */
621 CARRY_FORWARD;
622 SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
623 COMBA_STORE(b[9]);
624
625 /* output 10 */
626 CARRY_FORWARD;
627 SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
628 COMBA_STORE(b[10]);
629
630 /* output 11 */
631 CARRY_FORWARD;
632 SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
633 COMBA_STORE(b[11]);
634
635 /* output 12 */
636 CARRY_FORWARD;
637 SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
638 COMBA_STORE(b[12]);
639
640 /* output 13 */
641 CARRY_FORWARD;
642 SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
643 COMBA_STORE(b[13]);
644
645 /* output 14 */
646 CARRY_FORWARD;
647 SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
648 COMBA_STORE(b[14]);
649
650 /* output 15 */
651 CARRY_FORWARD;
652 SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
653 COMBA_STORE(b[15]);
654
655 /* output 16 */
656 CARRY_FORWARD;
657 SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
658 COMBA_STORE(b[16]);
659
660 /* output 17 */
661 CARRY_FORWARD;
662 SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
663 COMBA_STORE(b[17]);
664
665 /* output 18 */
666 CARRY_FORWARD;
667 SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
668 COMBA_STORE(b[18]);
669
670 /* output 19 */
671 CARRY_FORWARD;
672 SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
673 COMBA_STORE(b[19]);
674
675 /* output 20 */
676 CARRY_FORWARD;
677 SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
678 COMBA_STORE(b[20]);
679
680 /* output 21 */
681 CARRY_FORWARD;
682 SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
683 COMBA_STORE(b[21]);
684
685 /* output 22 */
686 CARRY_FORWARD;
687 SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
688 COMBA_STORE(b[22]);
689
690 /* output 23 */
691 CARRY_FORWARD;
692 SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
693 COMBA_STORE(b[23]);
694
695 /* output 24 */
696 CARRY_FORWARD;
697 SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
698 COMBA_STORE(b[24]);
699
700 /* output 25 */
701 CARRY_FORWARD;
702 SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
703 COMBA_STORE(b[25]);
704
705 /* output 26 */
706 CARRY_FORWARD;
707 SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
708 COMBA_STORE(b[26]);
709
710 /* output 27 */
711 CARRY_FORWARD;
712 SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
713 COMBA_STORE(b[27]);
714
715 /* output 28 */
716 CARRY_FORWARD;
717 SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
718 COMBA_STORE(b[28]);
719
720 /* output 29 */
721 CARRY_FORWARD;
722 SQRADD2(a[14], a[15]);
723 COMBA_STORE(b[29]);
724
725 /* output 30 */
726 CARRY_FORWARD;
727 SQRADD(a[15], a[15]);
728 COMBA_STORE(b[30]);
/* the carry out of column 30 (c1) is the top digit */
729 COMBA_STORE2(b[31]);
730 COMBA_FINI;
731
732 B->used = 32;
733 B->sign = PSTM_ZPOS;
734 memcpy(B->dp, b, 32 * sizeof(pstm_digit));
735 pstm_clamp(B);
736 return PSTM_OKAY;
737}
738#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
739
740
741#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
742static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
743{
744 pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
745#ifdef PSTM_ISO
746 pstm_word tt;
747#endif
748
749 if (B->alloc < 64) {
750 if (pstm_grow(B, 64) != PSTM_OKAY) {
751 return PS_MEM_FAIL;
752 }
753 }
754 sc0 = sc1 = sc2 = 0;
755 a = A->dp;
756 COMBA_START;
757
758 /* clear carries */
759 CLEAR_CARRY;
760
761 /* output 0 */
762 SQRADD(a[0],a[0]);
763 COMBA_STORE(b[0]);
764
765 /* output 1 */
766 CARRY_FORWARD;
767 SQRADD2(a[0], a[1]);
768 COMBA_STORE(b[1]);
769
770 /* output 2 */
771 CARRY_FORWARD;
772 SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
773 COMBA_STORE(b[2]);
774
775 /* output 3 */
776 CARRY_FORWARD;
777 SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
778 COMBA_STORE(b[3]);
779
780 /* output 4 */
781 CARRY_FORWARD;
782 SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
783 COMBA_STORE(b[4]);
784
785 /* output 5 */
786 CARRY_FORWARD;
787 SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
788 COMBA_STORE(b[5]);
789
790 /* output 6 */
791 CARRY_FORWARD;
792 SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
793 COMBA_STORE(b[6]);
794
795 /* output 7 */
796 CARRY_FORWARD;
797 SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
798 COMBA_STORE(b[7]);
799
800 /* output 8 */
801 CARRY_FORWARD;
802 SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
803 COMBA_STORE(b[8]);
804
805 /* output 9 */
806 CARRY_FORWARD;
807 SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
808 COMBA_STORE(b[9]);
809
810 /* output 10 */
811 CARRY_FORWARD;
812 SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
813 COMBA_STORE(b[10]);
814
815 /* output 11 */
816 CARRY_FORWARD;
817 SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
818 COMBA_STORE(b[11]);
819
820 /* output 12 */
821 CARRY_FORWARD;
822 SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
823 COMBA_STORE(b[12]);
824
825 /* output 13 */
826 CARRY_FORWARD;
827 SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
828 COMBA_STORE(b[13]);
829
830 /* output 14 */
831 CARRY_FORWARD;
832 SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
833 COMBA_STORE(b[14]);
834
835 /* output 15 */
836 CARRY_FORWARD;
837 SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
838 COMBA_STORE(b[15]);
839
840 /* output 16 */
841 CARRY_FORWARD;
842 SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
843 COMBA_STORE(b[16]);
844
845 /* output 17 */
846 CARRY_FORWARD;
847 SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
848 COMBA_STORE(b[17]);
849
850 /* output 18 */
851 CARRY_FORWARD;
852 SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
853 COMBA_STORE(b[18]);
854
855 /* output 19 */
856 CARRY_FORWARD;
857 SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
858 COMBA_STORE(b[19]);
859
860 /* output 20 */
861 CARRY_FORWARD;
862 SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
863 COMBA_STORE(b[20]);
864
865 /* output 21 */
866 CARRY_FORWARD;
867 SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
868 COMBA_STORE(b[21]);
869
870 /* output 22 */
871 CARRY_FORWARD;
872 SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
873 COMBA_STORE(b[22]);
874
875 /* output 23 */
876 CARRY_FORWARD;
877 SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
878 COMBA_STORE(b[23]);
879
880 /* output 24 */
881 CARRY_FORWARD;
882 SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
883 COMBA_STORE(b[24]);
884
885 /* output 25 */
886 CARRY_FORWARD;
887 SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
888 COMBA_STORE(b[25]);
889
890 /* output 26 */
891 CARRY_FORWARD;
892 SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
893 COMBA_STORE(b[26]);
894
895 /* output 27 */
896 CARRY_FORWARD;
897 SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
898 COMBA_STORE(b[27]);
899
900 /* output 28 */
901 CARRY_FORWARD;
902 SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
903 COMBA_STORE(b[28]);
904
905 /* output 29 */
906 CARRY_FORWARD;
907 SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
908 COMBA_STORE(b[29]);
909
910 /* output 30 */
911 CARRY_FORWARD;
912 SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
913 COMBA_STORE(b[30]);
914
915 /* output 31 */
916 CARRY_FORWARD;
917 SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
918 COMBA_STORE(b[31]);
919
920 /* output 32 */
921 CARRY_FORWARD;
922 SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
923 COMBA_STORE(b[32]);
924
925 /* output 33 */
926 CARRY_FORWARD;
927 SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
928 COMBA_STORE(b[33]);
929
930 /* output 34 */
931 CARRY_FORWARD;
932 SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
933 COMBA_STORE(b[34]);
934
935 /* output 35 */
936 CARRY_FORWARD;
937 SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
938 COMBA_STORE(b[35]);
939
940 /* output 36 */
941 CARRY_FORWARD;
942 SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
943 COMBA_STORE(b[36]);
944
945 /* output 37 */
946 CARRY_FORWARD;
947 SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
948 COMBA_STORE(b[37]);
949
950 /* output 38 */
951 CARRY_FORWARD;
952 SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
953 COMBA_STORE(b[38]);
954
955 /* output 39 */
956 CARRY_FORWARD;
957 SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
958 COMBA_STORE(b[39]);
959
960 /* output 40 */
961 CARRY_FORWARD;
962 SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
963 COMBA_STORE(b[40]);
964
965 /* output 41 */
966 CARRY_FORWARD;
967 SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
968 COMBA_STORE(b[41]);
969
970 /* output 42 */
971 CARRY_FORWARD;
972 SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
973 COMBA_STORE(b[42]);
974
975 /* output 43 */
976 CARRY_FORWARD;
977 SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
978 COMBA_STORE(b[43]);
979
980 /* output 44 */
981 CARRY_FORWARD;
982 SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
983 COMBA_STORE(b[44]);
984
985 /* output 45 */
986 CARRY_FORWARD;
987 SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
988 COMBA_STORE(b[45]);
989
990 /* output 46 */
991 CARRY_FORWARD;
992 SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
993 COMBA_STORE(b[46]);
994
995 /* output 47 */
996 CARRY_FORWARD;
997 SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
998 COMBA_STORE(b[47]);
999
1000 /* output 48 */
1001 CARRY_FORWARD;
1002 SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
1003 COMBA_STORE(b[48]);
1004
1005 /* output 49 */
1006 CARRY_FORWARD;
1007 SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
1008 COMBA_STORE(b[49]);
1009
1010 /* output 50 */
1011 CARRY_FORWARD;
1012 SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
1013 COMBA_STORE(b[50]);
1014
1015 /* output 51 */
1016 CARRY_FORWARD;
1017 SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
1018 COMBA_STORE(b[51]);
1019
1020 /* output 52 */
1021 CARRY_FORWARD;
1022 SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
1023 COMBA_STORE(b[52]);
1024
1025 /* output 53 */
1026 CARRY_FORWARD;
1027 SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
1028 COMBA_STORE(b[53]);
1029
1030 /* output 54 */
1031 CARRY_FORWARD;
1032 SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
1033 COMBA_STORE(b[54]);
1034
1035 /* output 55 */
1036 CARRY_FORWARD;
1037 SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
1038 COMBA_STORE(b[55]);
1039
1040 /* output 56 */
1041 CARRY_FORWARD;
1042 SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
1043 COMBA_STORE(b[56]);
1044
1045 /* output 57 */
1046 CARRY_FORWARD;
1047 SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
1048 COMBA_STORE(b[57]);
1049
1050 /* output 58 */
1051 CARRY_FORWARD;
1052 SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
1053 COMBA_STORE(b[58]);
1054
1055 /* output 59 */
1056 CARRY_FORWARD;
1057 SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
1058 COMBA_STORE(b[59]);
1059
1060 /* output 60 */
1061 CARRY_FORWARD;
1062 SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
1063 COMBA_STORE(b[60]);
1064
1065 /* output 61 */
1066 CARRY_FORWARD;
1067 SQRADD2(a[30], a[31]);
1068 COMBA_STORE(b[61]);
1069
1070 /* output 62 */
1071 CARRY_FORWARD;
1072 SQRADD(a[31], a[31]);
1073 COMBA_STORE(b[62]);
1074 COMBA_STORE2(b[63]);
1075 COMBA_FINI;
1076
1077 B->used = 64;
1078 B->sign = PSTM_ZPOS;
1079 memcpy(B->dp, b, 64 * sizeof(pstm_digit));
1080 pstm_clamp(B);
1081 return PSTM_OKAY;
1082}
1083#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1084
1085/******************************************************************************/
1086/*
1087 */
Denys Vlasenko37bdd8f2019-01-01 15:40:43 +01001088int32 FAST_FUNC pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
Denys Vlasenko11d00962017-01-15 00:12:42 +01001089 uint32 paDlen)
1090{
1091#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
1092 if (A->used == 16) {
1093 return pstm_sqr_comba16(A, B);
1094 } else {
1095#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1096 if (A->used == 32) {
1097 return pstm_sqr_comba32(A, B);
1098 }
1099#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1100 return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1101 }
1102#else
1103#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1104 if (A->used == 32) {
1105 return pstm_sqr_comba32(A, B);
1106 }
1107#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1108 return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1109#endif
1110}
1111
1112#endif /* DISABLE_PSTM */
1113/******************************************************************************/