| /* |
| * Copyright (C) 2017 Denys Vlasenko |
| * |
| * Licensed under GPLv2, see file LICENSE in this source tree. |
| */ |
| #include "tls.h" |
| |
/* This file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
 * Changes are flagged with //bbox
 */
| |
| /** |
| * @file pstm_montgomery_reduce.c |
| * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master) |
| * |
| * Multiprecision Montgomery Reduction. |
| */ |
| /* |
| * Copyright (c) 2013-2015 INSIDE Secure Corporation |
| * Copyright (c) PeerSec Networks, 2002-2011 |
| * All Rights Reserved |
| * |
| * The latest version of this code is available at http://www.matrixssl.org |
| * |
| * This software is open source; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2 of the License, or |
| * (at your option) any later version. |
| * |
| * This General Public License does NOT permit incorporating this software |
| * into proprietary programs. If you are unable to comply with the GPL, a |
| * commercial license for this software may be purchased from INSIDE at |
| * http://www.insidesecure.com/eng/Company/Locations |
| * |
 *	This program is distributed in the hope that it will be useful, but
 *	WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *	See the GNU General Public License for more details.
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| * http://www.gnu.org/copyleft/gpl.html |
| */ |
| /******************************************************************************/ |
| |
| //bbox |
| //#include "../cryptoApi.h" |
| #ifndef DISABLE_PSTM |
| |
| /******************************************************************************/ |
| |
| #if defined(PSTM_X86) |
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use PSTM_X86_64 instead */
| #if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT) |
| #error "PSTM_X86 option requires GCC and 32 bit mode x86 processor" |
| #endif |
| //#pragma message ("Using 32 bit x86 Assembly Optimizations") |
| |
| #define MONT_START |
| #define MONT_FINI |
| #define LOOP_END |
| #define LOOP_START \ |
| mu = c[x] * mp |
| |
| #if 0 |
| #define INNERMUL \ |
| asm( \ |
| "movl %5,%%eax \n\t" \ |
| "mull %4 \n\t" \ |
| "addl %1,%%eax \n\t" \ |
| "adcl $0,%%edx \n\t" \ |
| "addl %%eax,%0 \n\t" \ |
| "adcl $0,%%edx \n\t" \ |
| "movl %%edx,%1 \n\t" \ |
| :"=g"(_c[LO]), "=r"(cy) \ |
| :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \ |
| : "%eax", "%edx", "cc") |
| /* |
| * The above generated "error: 'asm' operand has impossible constraints" on Android. |
| * Do they reserve in their ABI a register for something, and there aren't enough left? |
| */ |
| #else |
/* Avoid the two explicit "movl" insns by telling the compiler to put the
 * input value of *tmpm++ into EAX, and to expect the cy result in EDX:
 */
#define INNERMUL \
	do { pstm_digit clobbered_eax; \
	asm( \
		"mull %5            \n\t" \
		"addl %4,%%eax      \n\t" \
		"adcl $0,%%edx      \n\t" \
		"addl %%eax,%0      \n\t" \
		"adcl $0,%%edx      \n\t" \
		:"=g"(_c[LO]), "=&d"(cy), "=&a"(clobbered_eax) \
		:"0"(_c[LO]), "g"(cy), "g"(mu), "2"(*tmpm++) \
		:"cc"); \
	(void)clobbered_eax; \
	} while (0)
/* The dummy "=&a" output tells the compiler that "mull" overwrites EAX,
 * so it must not assume EAX still holds the value of *tmpm afterwards.
 */
| #endif |
| |
| #define PROPCARRY \ |
| asm( \ |
| "addl %1,%0 \n\t" \ |
| "sbb %1,%1 \n\t" \ |
| "neg %1 \n\t" \ |
| :"=g"(_c[LO]), "=r"(cy) \ |
| :"0"(_c[LO]), "1"(cy) \ |
| :"cc") |
| |
| /******************************************************************************/ |
| #elif defined(PSTM_X86_64) |
| /* x86-64 optimized */ |
| #if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT) |
| #error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor" |
| #endif |
| //#pragma message ("Using 64 bit x86_64 Assembly Optimizations") |
| |
| #define MONT_START |
| #define MONT_FINI |
| #define LOOP_END |
| #define LOOP_START \ |
| mu = c[x] * mp |
| |
| #define INNERMUL \ |
| asm( \ |
| "movq %5,%%rax \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %1,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "addq %%rax,%0 \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| :"=g"(_c[LO]), "=r"(cy) \ |
| :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ |
| : "%rax", "%rdx", "cc") |
| |
| #define INNERMUL8 \ |
| asm( \ |
| "movq 0(%5),%%rax \n\t" \ |
| "movq 0(%2),%%r10 \n\t" \ |
| "movq 0x8(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x8(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "movq 0x10(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x10(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x8(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "movq 0x18(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x18(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x10(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "movq 0x20(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x20(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x18(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "movq 0x28(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x28(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x20(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "movq 0x30(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x30(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x28(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "movq 0x38(%5),%%r11 \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq 0x38(%2),%%r10 \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x30(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| "movq %%r11,%%rax \n\t" \ |
| "mulq %4 \n\t" \ |
| "addq %%r10,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "addq %3,%%rax \n\t" \ |
| "adcq $0,%%rdx \n\t" \ |
| "movq %%rax,0x38(%0) \n\t" \ |
| "movq %%rdx,%1 \n\t" \ |
| \ |
| :"=r"(_c), "=r"(cy) \ |
| : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ |
| : "%rax", "%rdx", "%r10", "%r11", "cc") |
| |
| #define PROPCARRY \ |
| asm( \ |
| "addq %1,%0 \n\t" \ |
| "setb %%al \n\t" \ |
| "movzbq %%al,%1 \n\t" \ |
| :"=g"(_c[LO]), "=r"(cy) \ |
| :"0"(_c[LO]), "1"(cy) \ |
| : "%rax", "cc") |
| |
| /******************************************************************************/ |
| #elif defined(PSTM_ARM) |
| |
| #define MONT_START |
| #define MONT_FINI |
| #define LOOP_END |
| #define LOOP_START \ |
| mu = c[x] * mp |
| |
| #ifdef __thumb2__ |
| //#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations") |
| #define INNERMUL \ |
| asm( \ |
| " LDR r0,%1 \n\t" \ |
| " ADDS r0,r0,%0 \n\t" \ |
| " ITE CS \n\t" \ |
| " MOVCS %0,#1 \n\t" \ |
| " MOVCC %0,#0 \n\t" \ |
| " UMLAL r0,%0,%3,%4 \n\t" \ |
| " STR r0,%1 \n\t" \ |
| :"=r"(cy),"=m"(_c[0])\ |
| :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\ |
| :"r0","cc"); |
| #define PROPCARRY \ |
| asm( \ |
| " LDR r0,%1 \n\t" \ |
| " ADDS r0,r0,%0 \n\t" \ |
| " STR r0,%1 \n\t" \ |
| " ITE CS \n\t" \ |
| " MOVCS %0,#1 \n\t" \ |
| " MOVCC %0,#0 \n\t" \ |
| :"=r"(cy),"=m"(_c[0])\ |
| :"0"(cy),"m"(_c[0])\ |
| :"r0","cc"); |
| #else /* Non-Thumb2 code */ |
| //#pragma message ("Using 32 bit ARM Assembly Optimizations") |
| #define INNERMUL \ |
| asm( \ |
| " LDR r0,%1 \n\t" \ |
| " ADDS r0,r0,%0 \n\t" \ |
| " MOVCS %0,#1 \n\t" \ |
| " MOVCC %0,#0 \n\t" \ |
| " UMLAL r0,%0,%3,%4 \n\t" \ |
| " STR r0,%1 \n\t" \ |
| :"=r"(cy),"=m"(_c[0])\ |
| :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\ |
| :"r0","cc"); |
| #define PROPCARRY \ |
| asm( \ |
| " LDR r0,%1 \n\t" \ |
| " ADDS r0,r0,%0 \n\t" \ |
| " STR r0,%1 \n\t" \ |
| " MOVCS %0,#1 \n\t" \ |
| " MOVCC %0,#0 \n\t" \ |
| :"=r"(cy),"=m"(_c[0])\ |
| :"0"(cy),"m"(_c[0])\ |
| :"r0","cc"); |
| #endif /* __thumb2__ */ |
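/* In both variants, "ADDS" updates the flags and the conditional
 * MOVCS/MOVCC pair converts the carry flag into a 1-or-0 cy value;
 * Thumb2 merely requires the extra "ITE CS" prefix before that pair.
 */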
| |
| |
| /******************************************************************************/ |
| #elif defined(PSTM_MIPS) |
| /* MIPS32 */ |
| //#pragma message ("Using 32 bit MIPS Assembly Optimizations") |
| #define MONT_START |
| #define MONT_FINI |
| #define LOOP_END |
| #define LOOP_START \ |
| mu = c[x] * mp |
| |
| #define INNERMUL \ |
| asm( \ |
| " multu %3,%4 \n\t" \ |
| " mflo $12 \n\t" \ |
| " mfhi $13 \n\t" \ |
| " addu $12,$12,%0 \n\t" \ |
| " sltu $10,$12,%0 \n\t" \ |
| " addu $13,$13,$10 \n\t" \ |
| " lw $10,%1 \n\t" \ |
| " addu $12,$12,$10 \n\t" \ |
| " sltu $10,$12,$10 \n\t" \ |
| " addu %0,$13,$10 \n\t" \ |
| " sw $12,%1 \n\t" \ |
| :"=r"(cy),"=m"(_c[0])\ |
| :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\ |
| :"$10","$12","$13")\ |
| ; ++tmpm; |
| |
| #define PROPCARRY \ |
| asm( \ |
| " lw $10,%1 \n\t" \ |
| " addu $10,$10,%0 \n\t" \ |
| " sw $10,%1 \n\t" \ |
| " sltu %0,$10,%0 \n\t" \ |
| :"=r"(cy),"=m"(_c[0])\ |
| :"r"(cy),"r"(_c[0])\ |
| :"$10"); |
| |
| |
| /******************************************************************************/ |
| #else |
| |
| /* ISO C code */ |
| #define MONT_START |
| #define MONT_FINI |
| #define LOOP_END |
| #define LOOP_START \ |
| mu = c[x] * mp |
| |
| #define INNERMUL \ |
| do { pstm_word t; \ |
| t = ((pstm_word)_c[0] + (pstm_word)cy) + \ |
| (((pstm_word)mu) * ((pstm_word)*tmpm++)); \ |
| _c[0] = (pstm_digit)t; \ |
| cy = (pstm_digit)(t >> DIGIT_BIT); \ |
| } while (0) |
| |
| #define PROPCARRY \ |
| do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0) |
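/* The INNERMUL accumulator cannot overflow: with b = 2^DIGIT_BIT its
 * maximum is (b-1) + (b-1) + (b-1)*(b-1) = b*b - 1, exactly the range of
 * the double-width pstm_word. PROPCARRY detects unsigned wraparound of
 * the single-digit addition with the (t < cy) test.
 */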
| |
| #endif |
| |
| /******************************************************************************/ |
| |
| #define LO 0 |
| |
| /* computes x/R == x (mod N) via Montgomery Reduction */ |
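/* A sketch of the method (standard Montgomery reduction): let b = 2^DIGIT_BIT,
 * R = b^pa with pa = m->used, and let mp = -1/m mod b as precomputed by
 * pstm_montgomery_setup(). Each round computes mu = c[x]*mp mod b, so that
 * c[x] + mu*m[0] == 0 mod b: adding mu*m shifted left by x digits zeroes
 * c[x] while leaving c unchanged mod m. After pa such rounds the low pa
 * digits are all zero and the high half satisfies c/b^pa == a/R (mod m),
 * up to one final subtraction of m.
 */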
| int32 FAST_FUNC pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m, |
| pstm_digit mp, pstm_digit *paD, uint32 paDlen) |
| { |
| pstm_digit *c, *_c, *tmpm, mu; |
| int32 oldused, x, y; |
| int pa; //bbox: was int16 |
| |
| pa = m->used; |
| if (pa > a->alloc) { |
		/* Sanity test for bad numbers: confirms the copies below cannot overrun */
| return PS_LIMIT_FAIL; |
| } |
| |
| if (paD && paDlen >= (uint32)2*pa+1) { |
| c = paD; |
| memset(c, 0x0, paDlen); |
| } else { |
| c = xzalloc(2*pa+1);//bbox |
| } |
| /* copy the input */ |
| oldused = a->used; |
| for (x = 0; x < oldused; x++) { |
| c[x] = a->dp[x]; |
| } |
| |
| MONT_START; |
| |
| for (x = 0; x < pa; x++) { |
| pstm_digit cy = 0; |
| /* get Mu for this round */ |
| LOOP_START; |
| _c = c + x; |
| tmpm = m->dp; |
| y = 0; |
| #ifdef PSTM_X86_64 |
| for (; y < (pa & ~7); y += 8) { |
| INNERMUL8; |
| _c += 8; |
| tmpm += 8; |
| } |
| #endif /* PSTM_X86_64 */ |
| for (; y < pa; y++) { |
| INNERMUL; |
| ++_c; |
| } |
| LOOP_END; |
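		/* propagate any carry left over from this round into higher digits */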
| while (cy) { |
| PROPCARRY; |
| ++_c; |
| } |
| } |
| |
	/* now copy out: the result is the high half of c, i.e. c divided by b^pa */
| _c = c + pa; |
| tmpm = a->dp; |
| for (x = 0; x < pa+1; x++) { |
| *tmpm++ = *_c++; |
| } |
| |
| for (; x < oldused; x++) { |
| *tmpm++ = 0; |
| } |
| |
| MONT_FINI; |
| |
| a->used = pa+1; |
| pstm_clamp(a); |
| |
| /* reuse x as return code */ |
| x = PSTM_OKAY; |
| |
| /* if A >= m then A = A - m */ |
| if (pstm_cmp_mag (a, m) != PSTM_LT) { |
| if (s_pstm_sub (a, m, a) != PSTM_OKAY) { |
| x = PS_MEM_FAIL; |
| } |
| } |
| if (paDlen < (uint32)2*pa+1) { |
| psFree(c, pool); |
| } |
| return x; |
| } |
| |
| #endif /* !DISABLE_PSTM */ |
| /******************************************************************************/ |