tls: format and send CLIENT_KEY_EXCHANGE

$ ./busybox tls kernel.org
insize:0 tail:0
got block len:74
got HANDSHAKE
got SERVER_HELLO
insize:79 tail:4265
got block len:4392
got HANDSHAKE
got CERTIFICATE
entered der @0x8b217a7:0x30 len:1452 inner_byte @0x8b217ab:0x30
entered der @0x8b217ab:0x30 len:1172 inner_byte @0x8b217af:0xa0
skipped der 0xa0, next byte 0x02
skipped der 0x02, next byte 0x30
skipped der 0x30, next byte 0x30
skipped der 0x30, next byte 0x30
skipped der 0x30, next byte 0x30
skipped der 0x30, next byte 0x30
entered der @0x8b218b4:0x30 len:418 inner_byte @0x8b218b8:0x30
skipped der 0x30, next byte 0x03
entered der @0x8b218c7:0x03 len:399 inner_byte @0x8b218cb:0x00
key bytes:399, first:0x00
entered der @0x8b218cc:0x30 len:394 inner_byte @0x8b218d0:0x02
binary bytes:385, first:0x00
skipped der 0x02, next byte 0x02
binary bytes:3, first:0x01
server_rsa_pub_key.size:384
insize:4397 tail:9
got block len:4
got SERVER_HELLO_DONE
insize:9 tail:0
^C

Next step: send CHANGE_CIPHER_SPEC... and actually implement it.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/networking/tls.c b/networking/tls.c
index 69c81b5..b0a4f7e 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -1,7 +1,7 @@
 /*
- * Licensed under GPLv2, see file LICENSE in this source tree.
- *
  * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
  */
 //config:config TLS
 //config:	bool "tls (debugging)"
@@ -10,6 +10,11 @@
 //applet:IF_TLS(APPLET(tls, BB_DIR_USR_BIN, BB_SUID_DROP))
 
 //kbuild:lib-$(CONFIG_TLS) += tls.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm_montgomery_reduce.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm_mul_comba.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm_sqr_comba.o
+//kbuild:lib-$(CONFIG_TLS) += tls_rsa.o
 ////kbuild:lib-$(CONFIG_TLS) += tls_ciphers.o
 ////kbuild:lib-$(CONFIG_TLS) += tls_aes.o
 ////kbuild:lib-$(CONFIG_TLS) += tls_aes_gcm.o
@@ -18,9 +23,7 @@
 //usage:       "HOST[:PORT]"
 //usage:#define tls_full_usage "\n\n"
 
-#include "libbb.h"
-//#include "tls_cryptoapi.h"
-//#include "tls_ciphers.h"
+#include "tls.h"
 
 #if 1
 # define dbg(...) fprintf(stderr, __VA_ARGS__)
@@ -28,23 +31,26 @@
 # define dbg(...) ((void)0)
 #endif
 
-#define RECORD_TYPE_CHANGE_CIPHER_SPEC 20
-#define RECORD_TYPE_ALERT              21
-#define RECORD_TYPE_HANDSHAKE          22
-#define RECORD_TYPE_APPLICATION_DATA   23
+#define RECORD_TYPE_CHANGE_CIPHER_SPEC  20
+#define RECORD_TYPE_ALERT               21
+#define RECORD_TYPE_HANDSHAKE           22
+#define RECORD_TYPE_APPLICATION_DATA    23
 
-#define HANDSHAKE_HELLO_REQUEST        0
-#define HANDSHAKE_CLIENT_HELLO         1
-#define HANDSHAKE_SERVER_HELLO         2
-#define HANDSHAKE_HELLO_VERIFY_REQUEST 3
-#define HANDSHAKE_NEW_SESSION_TICKET   4
-#define HANDSHAKE_CERTIFICATE          11
-#define HANDSHAKE_SERVER_KEY_EXCHANGE  12
-#define HANDSHAKE_CERTIFICATE_REQUEST  13
-#define HANDSHAKE_SERVER_HELLO_DONE    14
-#define HANDSHAKE_CERTIFICATE_VERIFY   15
-#define HANDSHAKE_CLIENT_KEY_EXCHANGE  16
-#define HANDSHAKE_FINISHED             20
+#define HANDSHAKE_HELLO_REQUEST         0
+#define HANDSHAKE_CLIENT_HELLO          1
+#define HANDSHAKE_SERVER_HELLO          2
+#define HANDSHAKE_HELLO_VERIFY_REQUEST  3
+#define HANDSHAKE_NEW_SESSION_TICKET    4
+#define HANDSHAKE_CERTIFICATE           11
+#define HANDSHAKE_SERVER_KEY_EXCHANGE   12
+#define HANDSHAKE_CERTIFICATE_REQUEST   13
+#define HANDSHAKE_SERVER_HELLO_DONE     14
+#define HANDSHAKE_CERTIFICATE_VERIFY    15
+#define HANDSHAKE_CLIENT_KEY_EXCHANGE   16
+#define HANDSHAKE_FINISHED              20
+
+#define SSL_HS_RANDOM_SIZE              32
+#define SSL_HS_RSA_PREMASTER_SIZE       48
 
 #define SSL_NULL_WITH_NULL_NULL                 0x0000
 #define SSL_RSA_WITH_NULL_MD5                   0x0001
@@ -112,6 +118,7 @@
 //TLS 1.2
 #define TLS_MAJ 3
 #define TLS_MIN 3
+//#define CIPHER_ID TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA // ok, recvs SERVER_KEY_EXCHANGE *** matrixssl uses this on my box
 //#define CIPHER_ID TLS_RSA_WITH_AES_256_CBC_SHA256 // ok, no SERVER_KEY_EXCHANGE
 // All GCMs:
 //#define CIPHER_ID TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 // SSL_ALERT_HANDSHAKE_FAILURE
@@ -123,9 +130,9 @@
 //#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_256_GCM_SHA384
 //#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_128_GCM_SHA256 // SSL_ALERT_HANDSHAKE_FAILURE
 //#define CIPHER_ID TLS_RSA_WITH_AES_256_GCM_SHA384 // ok, no SERVER_KEY_EXCHANGE
-#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE
+#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE *** select this?
 //#define CIPHER_ID TLS_DH_anon_WITH_AES_256_CBC_SHA // SSL_ALERT_HANDSHAKE_FAILURE
-// (tested b/c this one doesn't req server certs... no luck)
+//^^^^^^^^^^^^^^^^^^^^^^^ (tested b/c this one doesn't req server certs... no luck)
 //test TLS_RSA_WITH_AES_128_CBC_SHA, in tls 1.2 it's mandated to be always supported
 
 struct record_hdr {
@@ -137,8 +144,7 @@
 typedef struct tls_state {
 	int fd;
 
-	uint8_t *pubkey;
-	int pubkey_len;
+	psRsaKey_t server_rsa_pub_key;
 
 	// RFC 5246
 	// |6.2.1. Fragmentation
@@ -170,6 +176,12 @@
 	uint8_t inbuf[18*1024];
 } tls_state_t;
 
+void tls_get_random(void *buf, unsigned len)
+{
+	/* Fill buf with len bytes read from /dev/urandom; anything but a full read is fatal */
+	if (open_read_close("/dev/urandom", buf, len) != len) xfunc_die();
+}
+
 static
 tls_state_t *new_tls_state(void)
 {
@@ -286,7 +298,7 @@
 	hello.len24_lo  = (sizeof(hello) - sizeof(hello.xhdr) - 4);
 	hello.proto_maj = TLS_MAJ;
 	hello.proto_min = TLS_MIN;
-	open_read_close("/dev/urandom", hello.rand32, sizeof(hello.rand32));
+	tls_get_random(hello.rand32, sizeof(hello.rand32));
 	//hello.session_id_len = 0;
 	//hello.cipherid_len16_hi = 0;
 	hello.cipherid_len16_lo = 2 * 1;
@@ -407,7 +419,18 @@
 	return new_der;
 }
 
-static void *find_key_in_der_cert(int *key_len, uint8_t *der, int len)
+static void der_binary_to_pstm(pstm_int *pstm_n, uint8_t *der, uint8_t *end)
+{
+	/* Load the payload of the DER item at 'der' into *pstm_n (first byte is most significant) */
+	uint8_t *bin_ptr;
+	unsigned len = get_der_len(&bin_ptr, der, end);
+	if (len == 0) xfunc_die(); /* empty item: bin_ptr[0] below would lie outside of it */
+	dbg("binary bytes:%u, first:0x%02x\n", len, bin_ptr[0]);
+	pstm_init_for_read_unsigned_bin(/*pool:*/ NULL, pstm_n, len);
+	pstm_read_unsigned_bin(pstm_n, bin_ptr, len);
+}
+
+static void find_key_in_der_cert(tls_state_t *tls, uint8_t *der, int len)
 {
 /* Certificate is a DER-encoded data structure. Each DER element has a length,
  * which makes it easy to skip over large compound elements of any complexity
@@ -504,19 +527,43 @@
 	der = skip_der_item(der, end); /* validity */
 	der = skip_der_item(der, end); /* subject */
 
-	/* enter "subjectPublicKeyInfo" */
+	/* enter subjectPublicKeyInfo */
 	der = enter_der_item(der, &end);
-
-	/* skip "subjectPublicKeyInfo.algorithm" */
+	{ /* check subjectPublicKeyInfo.algorithm */
+		static const uint8_t expected[] = {
+			0x30,0x0d, // SEQ 13 bytes
+			0x06,0x09, 0x2a,0x86,0x48,0x86,0xf7,0x0d,0x01,0x01,0x01, // OID RSA_KEY_ALG 42.134.72.134.247.13.1.1.1
+			//0x05,0x00, // NULL
+		};
+		if (memcmp(der, expected, sizeof(expected)) != 0)
+			bb_error_msg_and_die("not RSA key");
+	}
+	/* skip subjectPublicKeyInfo.algorithm */
 	der = skip_der_item(der, end);
-	/* enter "subjectPublicKeyInfo.publicKey" */
+	/* enter subjectPublicKeyInfo.publicKey */
 //	die_if_not_this_der_type(der, end, 0x03); /* must be BITSTRING */
 	der = enter_der_item(der, &end);
 
-	/* return a copy */
-	*key_len = end - der;
-	dbg("copying key bytes:%u, first:0x%02x\n", *key_len, der[0]);
-	return xmemdup(der, *key_len);
+	/* parse RSA key: */
+//based on getAsnRsaPubKey(), pkcs1ParsePrivBin() is also of note
+	dbg("key bytes:%u, first:0x%02x\n", (int)(end - der), der[0]);
+	if (end - der < 14) xfunc_die();
+	/* example format:
+	 * ignore bits: 00
+	 * SEQ 0x018a/394 bytes: 3082018a
+	 *   INTEGER 0x0181/385 bytes (modulus): 02820181 XX...XXX
+	 *   INTEGER 3 bytes (exponent): 0203 010001
+	 */
+	if (*der != 0) /* "ignore bits", should be 0 */
+		xfunc_die();
+	der++;
+	der = enter_der_item(der, &end); /* enter SEQ */
+	//memset(tls->server_rsa_pub_key, 0, sizeof(tls->server_rsa_pub_key));
+	der_binary_to_pstm(&tls->server_rsa_pub_key.N, der, end); /* modulus */
+	der = skip_der_item(der, end);
+	der_binary_to_pstm(&tls->server_rsa_pub_key.e, der, end); /* exponent */
+	tls->server_rsa_pub_key.size = pstm_unsigned_bin_size(&tls->server_rsa_pub_key.N);
+	dbg("server_rsa_pub_key.size:%d\n", tls->server_rsa_pub_key.size);
 }
 
 static void get_server_cert_or_die(tls_state_t *tls)
@@ -553,7 +600,107 @@
 	len = len1;
 
 	if (len)
-		tls->pubkey = find_key_in_der_cert(&tls->pubkey_len, certbuf + 10, len);
+		find_key_in_der_cert(tls, certbuf + 10, len);
+}
+
+static void send_client_key_exchange(tls_state_t *tls)
+{
+#if 0 //matrixssl code snippets:
+	int32 csRsaEncryptPub(psPool_t *pool, psPubKey_t *key,
+	                        unsigned char *in, uint32 inlen, unsigned char *out, uint32 outlen,
+	                        void *data)
+	{
+	        psAssert(key->type == PS_RSA);
+	        return psRsaEncryptPub(pool, (psRsaKey_t*)key->key, in, inlen, out, outlen,
+	                        data);
+	}
+...
+	/* pkaAfter.user is buffer len */
+	if ((rc = csRsaEncryptPub(pka->pool, &ssl->sec.cert->publicKey,
+			ssl->sec.premaster,	ssl->sec.premasterSize, pka->outbuf,
+			pka->user, pka->data)) < 0) {
+		if (rc == PS_PENDING) {
+			/* For these ClientKeyExchange paths, we do want to come
+				back through nowDoCkePka for a double pass so each
+				case can manage its own pkaAfter and to make sure
+				psX509FreeCert and sslCreateKeys() are hit below. */
+			return rc;
+		}
+		psTraceIntInfo("csRsaEncryptPub in CKE failed %d\n", rc);
+		return MATRIXSSL_ERROR;
+	}
+	/* RSA closed the pool on second pass */
+	pka->pool = NULL;
+	clearPkaAfter(ssl);
+...
+#ifdef USE_RSA_CIPHER_SUITE
+/*
+			Standard RSA suite
+*/
+			ssl->sec.premasterSize = SSL_HS_RSA_PREMASTER_SIZE;
+			ssl->sec.premaster = psMalloc(ssl->hsPool,
+									SSL_HS_RSA_PREMASTER_SIZE);
+			if (ssl->sec.premaster == NULL) {
+				return SSL_MEM_ERROR;
+			}
+
+			ssl->sec.premaster[0] = ssl->reqMajVer;
+			ssl->sec.premaster[1] = ssl->reqMinVer;
+			if (matrixCryptoGetPrngData(ssl->sec.premaster + 2,
+					SSL_HS_RSA_PREMASTER_SIZE - 2, ssl->userPtr) < 0) {
+				return MATRIXSSL_ERROR;
+			}
+
+			/* Shedule RSA encryption.  Put tmp pool under control of After */
+			pkaAfter->type = PKA_AFTER_RSA_ENCRYPT;
+			pkaAfter->outbuf = c;
+			pkaAfter->data = pkiData;
+			pkaAfter->pool = pkiPool;
+			pkaAfter->user = (uint32)(end - c); /* Available space */
+
+			c += keyLen;
+#endif
+#endif // 0
+	/* Sends CLIENT_KEY_EXCHANGE: a random premaster secret encrypted with the server's RSA key */
+	struct client_key_exchange {
+		struct record_hdr xhdr;
+		uint8_t type;
+		uint8_t len24_hi, len24_mid, len24_lo;
+		uint8_t keylen16_hi, keylen16_lo; /* exist for RSA, but not for some other key types */
+//had a bug when had no keylen: we:
+//write(3, "\x16\x03\x03\x01\x84\x10\x00\x01\x80\xXX\xXX\xXX\xXX\xXX\xXX...", 393) = 393
+//openssl:
+//write to 0xe9a090 [0xf9ac20] (395 bytes => 395 (0x18B))
+//0000 -      16  03  03  01  86  10  00  01 -82  01  80  xx  xx  xx  xx  xx
+		uint8_t key[384]; // size?? fixed 384 bytes: only matches a 3072-bit server key
+	};
+	struct client_key_exchange record;
+	uint8_t premaster[SSL_HS_RSA_PREMASTER_SIZE];
+
+	memset(&record, 0, sizeof(record));
+	record.xhdr.type = RECORD_TYPE_HANDSHAKE;
+	record.xhdr.proto_maj = TLS_MAJ;
+	record.xhdr.proto_min = TLS_MIN;
+	record.xhdr.len16_hi = (sizeof(record) - sizeof(record.xhdr)) >> 8;
+	record.xhdr.len16_lo = (sizeof(record) - sizeof(record.xhdr)) & 0xff;
+	record.type = HANDSHAKE_CLIENT_KEY_EXCHANGE;
+	//record.len24_hi  = 0;
+	record.len24_mid = (sizeof(record) - sizeof(record.xhdr) - 4) >> 8;
+	record.len24_lo  = (sizeof(record) - sizeof(record.xhdr) - 4) & 0xff;
+	record.keylen16_hi = (sizeof(record) - sizeof(record.xhdr) - 6) >> 8;
+	record.keylen16_lo = (sizeof(record) - sizeof(record.xhdr) - 6) & 0xff;
+
+	tls_get_random(premaster, sizeof(premaster));
+	premaster[0] = TLS_MAJ;
+	premaster[1] = TLS_MIN;
+	if (psRsaEncryptPub(/*pool:*/ NULL,
+		/* psRsaKey_t* */ &tls->server_rsa_pub_key,
+		premaster, /*inlen:*/ sizeof(premaster),
+		record.key, sizeof(record.key),
+		data_param_ignored
+	) < 0) /* negative result = failure, same test as the matrixssl caller quoted above */
+		bb_error_msg_and_die("RSA encryption of premaster failed");
+	xwrite(tls->fd, &record, sizeof(record));
+}
 
 static void tls_handshake(tls_state_t *tls)
@@ -614,6 +761,8 @@
 		// 459 bytes:
 		// 0c   00|01|c7 03|00|17|41|04|87|94|2e|2f|68|d0|c9|f4|97|a8|2d|ef|ed|67|ea|c6|f3|b3|56|47|5d|27|b6|bd|ee|70|25|30|5e|b0|8e|f6|21|5a...
 		//SvKey len=455^
+		// with TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA: 461 bytes:
+		// 0c   00|01|c9 03|00|17|41|04|cd|9b|b4|29|1f|f6|b0|c2|84|82|7f|29|6a|47|4e|ec|87|0b|c1|9c|69|e1|f8|c6|d0|53|e9|27|90|a5|c8|02|15|75...
 		dbg("got SERVER_KEY_EXCHANGE\n");
 		len = xread_tls_block(tls);
 		break;
@@ -624,6 +773,8 @@
 	case HANDSHAKE_SERVER_HELLO_DONE:
 		// 0e 000000 (len:0)
 		dbg("got SERVER_HELLO_DONE\n");
+		send_client_key_exchange(tls);
+		len = xread_tls_block(tls);
 		break;
 	default:
 		tls_error_die(tls);
diff --git a/networking/tls.h b/networking/tls.h
new file mode 100644
index 0000000..20317ec
--- /dev/null
+++ b/networking/tls.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "libbb.h"
+
+/* config tweaks */
+#define HAVE_NATIVE_INT64 1
+#undef  DISABLE_PSTM
+#undef  USE_1024_KEY_SPEED_OPTIMIZATIONS
+#undef  USE_2048_KEY_SPEED_OPTIMIZATIONS
+//TODO: enable to use asm:
+//#if defined(__GNUC__) && defined(__i386__)   -> #define PSTM_32BIT and PSTM_X86
+//#if defined(__GNUC__) && defined(__x86_64__) -> #define PSTM_64BIT and PSTM_X86_64
+//ARM and MIPS also have these
+
+
+#define PS_SUCCESS              0
+#define PS_FAILURE              -1
+#define PS_ARG_FAIL             -6      /* Failure due to bad function param */
+#define PS_PLATFORM_FAIL        -7      /* Failure as a result of system call error */
+#define PS_MEM_FAIL             -8      /* Failure to allocate requested memory */
+#define PS_LIMIT_FAIL           -9      /* Failure on sanity/limit tests */
+
+#define PS_TRUE         1
+#define PS_FALSE        0
+
+#if BB_BIG_ENDIAN
+# define ENDIAN_BIG     1
+# undef  ENDIAN_LITTLE
+//#????  ENDIAN_32BITWORD
+// controls only STORE32L, which we don't use
+#else
+# define ENDIAN_LITTLE  1
+# undef  ENDIAN_BIG
+#endif
+
+typedef uint64_t uint64;
+typedef  int64_t  int64;
+typedef uint32_t uint32;
+typedef  int32_t  int32;
+typedef uint16_t uint16;
+typedef  int16_t  int16;
+
+//FIXME
+typedef char psPool_t;
+
+//#ifdef PS_PUBKEY_OPTIMIZE_FOR_SMALLER_RAM
+#define PS_EXPTMOD_WINSIZE   3
+//#ifdef PS_PUBKEY_OPTIMIZE_FOR_FASTER_SPEED
+//#define PS_EXPTMOD_WINSIZE 5
+
+#define PUBKEY_TYPE     0x01
+#define PRIVKEY_TYPE    0x02
+
+void tls_get_random(void *buf, unsigned len);
+
+#define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
+
+#define psFree(p, pool)    free(p)
+#define psTraceCrypto(msg) bb_error_msg_and_die(msg)
+
+/* "Secure" zerofill - NB: mapped to plain memset here, which a compiler may optimize away */
+#define memset_s(A,B,C,D) memset((A),(C),(D))
+/* "Constant time" memory comparison - NB: mapped to plain memcmp here, timing is NOT constant */
+#define memcmpct(s1, s2, len) memcmp((s1), (s2), (len))
+#undef min
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
+
+#include "tls_pstm.h"
+#include "tls_rsa.h"
diff --git a/networking/tls_pstm.c b/networking/tls_pstm.c
new file mode 100644
index 0000000..0d797f8
--- /dev/null
+++ b/networking/tls_pstm.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ *	@file    pstm.c
+ *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ *	Multiprecision number implementation.
+ */
+/*
+ *	Copyright (c) 2013-2015 INSIDE Secure Corporation
+ *	Copyright (c) PeerSec Networks, 2002-2011
+ *	All Rights Reserved
+ *
+ *	The latest version of this code is available at http://www.matrixssl.org
+ *
+ *	This software is open source; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	This General Public License does NOT permit incorporating this software
+ *	into proprietary programs.  If you are unable to comply with the GPL, a
+ *	commercial license for this software may be purchased from INSIDE at
+ *	http://www.insidesecure.com/eng/Company/Locations
+ *
+ *	This program is distributed in WITHOUT ANY WARRANTY; without even the
+ *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *	See the GNU General Public License for more details.
+ *
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *	http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c);
+
+/******************************************************************************/
+/*
+	Init a pstm_int with room for 'size' digits; its value is zero. Always returns PSTM_OKAY.
+ */
+int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size)
+{
+//	uint16		x;
+
+/*
+	alloc mem (no NULL check here: presumably busybox xzalloc never returns NULL)
+ */
+	a->dp = xzalloc(sizeof (pstm_digit) * size);
+	a->pool = pool;
+	a->used  = 0;
+	a->alloc = (int16)size;	/* NB: the uint32 size is narrowed to int16 */
+	a->sign  = PSTM_ZPOS;
+/*
+	zero the digits: loop disabled, presumably because xzalloc returns zeroed memory
+ */
+///bbox
+//	for (x = 0; x < size; x++) {
+//		a->dp[x] = 0;
+//	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Init a new pstm_int with PSTM_DEFAULT_INIT digits; value is zero. Always returns PSTM_OKAY.
+*/
+int32 pstm_init(psPool_t *pool, pstm_int * a)
+{
+//	int32		i;
+/*
+	allocate memory required and clear it
+ */
+	a->dp = xzalloc(sizeof (pstm_digit) * PSTM_DEFAULT_INIT);
+/*
+	set the digits to zero: loop disabled, presumably because xzalloc returns zeroed memory
+ */
+///bbox
+//	for (i = 0; i < PSTM_DEFAULT_INIT; i++) {
+//		a->dp[i] = 0;
+//	}
+/*
+	set the used to zero, allocated digits to the default precision and sign
+	to positive
+ */
+	a->pool = pool;
+	a->used  = 0;
+	a->alloc = PSTM_DEFAULT_INIT;
+	a->sign  = PSTM_ZPOS;
+
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Grow a to at least 'size' digits; never shrinks. Always returns PSTM_OKAY.
+ */
+int32 pstm_grow(pstm_int * a, int16 size)
+{
+	int16			i;
+	pstm_digit		*tmp;
+
+/*
+	If the alloc size is smaller alloc more ram.
+ */
+	if (a->alloc < size) {
+/*
+		Reallocate the array a->dp
+
+		The temporary is a leftover from a fallible realloc: no failure
+		path exists here, the xrealloc result is used unconditionally.
+*/
+		tmp = xrealloc(a->dp, sizeof (pstm_digit) * size);
+/*
+		set a->dp to the (possibly moved) array
+ */
+		a->dp = tmp;
+/*
+		zero excess digits (the newly added ones, from old alloc to new alloc)
+ */
+		i			= a->alloc;
+		a->alloc	= size;
+		for (; i < a->alloc; i++) {
+			a->dp[i] = 0;
+		}
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	copy, b = a (b must be initialized; it is grown here if too small)
+ */
+int32 pstm_copy(pstm_int * a, pstm_int * b)
+{
+	int32	res, n;
+
+/*
+	If dst == src do nothing
+ */
+	if (a == b) {
+		return PSTM_OKAY;
+	}
+/*
+	Grow dest so that it can hold all used digits of the source
+ */
+	if (b->alloc < a->used) {
+		if ((res = pstm_grow (b, a->used)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+/*
+	Copy a's digits into b, then zero whatever b had in use above them
+ */
+	{
+		register pstm_digit *tmpa, *tmpb;
+
+		/* pointer aliases */
+		/* source */
+		tmpa = a->dp;
+
+		/* destination */
+		tmpb = b->dp;
+
+		/* copy all the digits */
+		for (n = 0; n < a->used; n++) {
+			*tmpb++ = *tmpa++;
+		}
+
+		/* clear high digits (only up to b's old used count) */
+		for (; n < b->used; n++) {
+			*tmpb++ = 0;
+		}
+	}
+/*
+	copy used count and sign
+ */
+	b->used = a->used;
+	b->sign = a->sign;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Trim unused digits
+
+	This is used to ensure that leading zero digits are trimmed and the
+	leading "used" digit will be non-zero. Typically very fast.  Also resets
+	the sign to PSTM_ZPOS if no used digits remain (i.e. the value is zero)
+*/
+void pstm_clamp(pstm_int * a)
+{
+/*	decrease used while the most significant digit is zero. */
+	while (a->used > 0 && a->dp[a->used - 1] == 0) {
+		--(a->used);
+	}
+/*	reset the sign flag if used == 0 */
+	if (a->used == 0) {
+		a->sign = PSTM_ZPOS;
+	}
+}
+
+/******************************************************************************/
+/*
+	clear one (frees). Safe to call with a == NULL or on an already cleared int.
+ */
+void pstm_clear(pstm_int * a)
+{
+	int32		i;
+/*
+	only do anything if a hasn't been freed previously
+ */
+	if (a != NULL && a->dp != NULL) {
+/*
+		first zero the digits (only the used ones) so the value does not linger
+ */
+		for (i = 0; i < a->used; i++) {
+			a->dp[i] = 0;
+		}
+
+		psFree (a->dp, a->pool);
+/*
+		reset members to make debugging easier; dp == NULL also marks "freed"
+ */
+		a->dp		= NULL;
+		a->alloc	= a->used = 0;
+		a->sign		= PSTM_ZPOS;
+	}
+}
+
+/******************************************************************************/
+/*
+	clear many (frees). Stops at the first NULL argument: later ones are NOT cleared.
+ */
+void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2,
+					pstm_int *mp3, pstm_int *mp4, pstm_int *mp5,
+					pstm_int *mp6, pstm_int *mp7)
+{
+	int32		n;		/* index into tempArray */
+
+	pstm_int	*tempArray[9];
+
+	tempArray[0] = mp0;
+	tempArray[1] = mp1;
+	tempArray[2] = mp2;
+	tempArray[3] = mp3;
+	tempArray[4] = mp4;
+	tempArray[5] = mp5;
+	tempArray[6] = mp6;
+	tempArray[7] = mp7;
+	tempArray[8] = NULL;	/* sentinel: ends the loop when all eight are non-NULL */
+
+	for (n = 0; tempArray[n] != NULL; n++) {
+		if ((tempArray[n] != NULL) && (tempArray[n]->dp != NULL)) {
+			pstm_clear(tempArray[n]);
+		}
+	}
+}
+
+/******************************************************************************/
+/*
+	Set to zero: sign, used count and every allocated digit.
+ */
+void pstm_zero(pstm_int * a)
+{
+	int32		idx;
+
+	/* canonical zero: no digits in use, non-negative sign */
+	a->used = 0;
+	a->sign = PSTM_ZPOS;
+
+	/* wipe the whole allocation, not only the digits that were in use */
+	for (idx = 0; idx < a->alloc; idx++) {
+		a->dp[idx] = 0;
+	}
+}
+
+
+/******************************************************************************/
+/*
+	Compare magnitude of two ints (unsigned). Returns PSTM_GT, PSTM_LT or PSTM_EQ.
+ */
+int32 pstm_cmp_mag(pstm_int * a, pstm_int * b)
+{
+	int16		n;
+	pstm_digit	*tmpa, *tmpb;
+
+/*
+	compare based on # of used digits (meaningful only if both are clamped)
+ */
+	if (a->used > b->used) {
+		return PSTM_GT;
+	}
+
+	if (a->used < b->used) {
+		return PSTM_LT;
+	}
+
+	/* alias for a: its most significant used digit */
+	tmpa = a->dp + (a->used - 1);
+
+	/* alias for b: same position (used counts are equal here) */
+	tmpb = b->dp + (a->used - 1);
+
+/*
+	compare based on digits, most significant first
+ */
+	for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
+		if (*tmpa > *tmpb) {
+			return PSTM_GT;
+		}
+		if (*tmpa < *tmpb) {
+			return PSTM_LT;
+		}
+	}
+	return PSTM_EQ;
+}
+
+/******************************************************************************/
+/*
+	Compare two ints (signed). Returns PSTM_GT, PSTM_LT or PSTM_EQ.
+ */
+int32 pstm_cmp(pstm_int * a, pstm_int * b)
+{
+/*
+	compare based on sign: with different signs the negative one is smaller
+ */
+	if (a->sign != b->sign) {
+		if (a->sign == PSTM_NEG) {
+			return PSTM_LT;
+		} else {
+			return PSTM_GT;
+		}
+	}
+/*
+	same sign: compare digits
+ */
+	if (a->sign == PSTM_NEG) {
+		/* if negative compare opposite direction */
+		return pstm_cmp_mag(b, a);
+	} else {
+		return pstm_cmp_mag(a, b);
+	}
+}
+
+/******************************************************************************/
+/*
+	pstm_ints can be initialized more precisely when they will be populated
+	using pstm_read_unsigned_bin since the length of the byte stream is known
+*/
+int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a, uint32 len)
+{
+	int32 size;
+/*
+	Need to set this based on how many words max it will take to store the bin.
+	The magic + 2:
+		1 to round up for the remainder of this integer math
+		1 for the initial carry of '1' bits that fall between DIGIT_BIT and 8
+*/
+	size = (((len / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT))
+		/ DIGIT_BIT) + 2;
+	return pstm_init_size(pool, a, size);
+}
+
+
+/******************************************************************************/
+/*
+	Reads an unsigned char array (c bytes at b) into pstm_int format.  User should
+	have called pstm_init_for_read_unsigned_bin first.  There is some grow logic
+	here if the default pstm_init was used but we don't really want to hit it.
+*/
+int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c)
+{
+	/* zero the int */
+	pstm_zero (a);
+
+/*
+	If we know the endianness of this architecture, and we're using
+	32-bit pstm_digits, we can optimize this
+*/
+#if (defined(ENDIAN_LITTLE) || defined(ENDIAN_BIG)) && !defined(PSTM_64BIT)
+  /* But not for both simultaneously */
+#if defined(ENDIAN_LITTLE) && defined(ENDIAN_BIG)
+#error Both ENDIAN_LITTLE and ENDIAN_BIG defined.
+#endif
+	{
+		unsigned char *pd;
+		if ((unsigned)c > (PSTM_MAX_SIZE * sizeof(pstm_digit))) {
+			uint32 excess = c - (PSTM_MAX_SIZE * sizeof(pstm_digit));
+			c -= excess;	/* oversized input: silently drop the first 'excess' bytes */
+			b += excess;
+		}
+		a->used = (int16)((c + sizeof(pstm_digit) - 1)/sizeof(pstm_digit));
+		if (a->alloc < a->used) {
+			if (pstm_grow(a, a->used) != PSTM_OKAY) {
+				return PSTM_MEM;
+			}
+		}
+		pd = (unsigned char *)a->dp;
+		/* read the bytes in, filling the digit array through a byte pointer */
+#ifdef ENDIAN_BIG
+		{
+			/* Use Duff's device to unroll the loop. NOTE(review): c == 0 looks unsafe here (idx < 0) */
+			int32 idx = (c - 1) & ~3;
+			switch (c % 4) {
+				case 0:	do { pd[idx+0] = *b++;
+					case 3:	     pd[idx+1] = *b++;
+					case 2:	     pd[idx+2] = *b++;
+					case 1:	     pd[idx+3] = *b++;
+					idx -= 4;
+				} while ((c -= 4) > 0);
+			}
+		}
+#else
+		for (c -= 1; c >= 0; c -= 1) {
+			pd[c] = *b++;	/* first input byte lands at the highest address */
+		}
+#endif
+	}
+#else
+	/* Big enough based on the len? */
+	a->used = (((c / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT))
+		/ DIGIT_BIT) + 2;
+
+	if (a->alloc < a->used) {
+		if (pstm_grow(a, a->used) != PSTM_OKAY) {
+			return PSTM_MEM;
+		}
+	}
+	/* read the bytes in: shift what we have up by 8 bits, OR in the next byte */
+	for (; c > 0; c--) {
+		if (pstm_mul_2d (a, 8, a) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		a->dp[0] |= *b++;
+		a->used += 1;
+	}
+#endif
+
+	pstm_clamp (a);
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*	Number of bits needed for the magnitude of a: 0 when a->used == 0, otherwise
+	DIGIT_BIT per digit below the top one plus the bit length of the top digit */
+int16 pstm_count_bits (pstm_int * a)
+{
+	int16     r;
+	pstm_digit q;
+
+	if (a->used == 0) {
+		return 0;
+	}
+
+	/* all digits below the top one count in full */
+	r = (a->used - 1) * DIGIT_BIT;
+
+	/* take the last digit and count the bits in it */
+	q = a->dp[a->used - 1];
+	while (q > ((pstm_digit) 0)) {
+		++r;
+		q >>= ((pstm_digit) 1);
+	}
+	return r;
+}
+
+/******************************************************************************/
+int32 pstm_unsigned_bin_size(pstm_int *a)
+{
+	int32 nbits = pstm_count_bits(a);	/* bit length of a */
+	return nbits / 8 + ((nbits & 7) ? 1 : 0);	/* whole bytes, plus one for a partial byte */
+}
+
+/******************************************************************************/
+void pstm_set(pstm_int *a, pstm_digit b)
+{
+	pstm_zero(a);			/* also resets used count and sign */
+	a->dp[0] = b;			/* a becomes the single-digit value b... */
+	a->used = (b != 0) ? 1 : 0;	/* ...except that zero has no used digits */
+}
+
+/******************************************************************************/
+/*
+	Right shift by x whole digits (the x lowest digits are dropped)
+*/
+void pstm_rshd(pstm_int *a, int16 x)
+{
+	int16 y;
+
+	/* shifting out every used digit: just zero and return */
+	if (x >= a->used) {
+		pstm_zero(a);
+		return;
+	}
+
+	/* shift: move each surviving digit down by x positions */
+	for (y = 0; y < a->used - x; y++) {
+		a->dp[y] = a->dp[y+x];
+	}
+
+	/* zero rest (the vacated top x positions) */
+	for (; y < a->used; y++) {
+		a->dp[y] = 0;
+	}
+
+	/* decrement count */
+	a->used -= x;
+	pstm_clamp(a);
+}
+
+/******************************************************************************/
+/*
+	Shift left by b whole digits (b zero digits are inserted at the bottom).
+ */
+int32 pstm_lshd(pstm_int * a, int16 b)
+{
+	int16	x;
+	int32	res;
+
+/*
+	If it is less than or equal to zero, there is nothing to do.
+ */
+	if (b <= 0) {
+		return PSTM_OKAY;
+	}
+/*
+	Grow to fit the new digits.
+ */
+	if (a->alloc < a->used + b) {
+		if ((res = pstm_grow (a, a->used + b)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+
+	{
+		register pstm_digit *top, *bottom;
+/*
+		Increment the used by the shift amount then copy upwards.
+ */
+		a->used += b;
+
+		/* top: destination, the new most significant digit */
+		top = a->dp + a->used - 1;
+
+		/* base: source, the old most significant digit */
+		bottom = a->dp + a->used - 1 - b;
+/*
+		Digits are moved starting from the most significant one, so a source
+		digit is always read before its slot is overwritten.
+ */
+		for (x = a->used - 1; x >= b; x--) {
+			*top-- = *bottom--;
+		}
+
+		/* zero the lower digits */
+		top = a->dp;
+		for (x = 0; x < b; x++) {
+			*top++ = 0;
+		}
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	computes a = 2**b (a is left as zero, with PSTM_OKAY returned, when b < 0)
+*/
+int32 pstm_2expt(pstm_int *a, int16 b)
+{
+	int16     z;
+
+   /* zero a as per default */
+	pstm_zero (a);
+
+	if (b < 0) {
+		return PSTM_OKAY;
+	}
+
+	z = b / DIGIT_BIT;	/* index of the digit that holds bit b */
+	if (z >= PSTM_MAX_SIZE) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* set the used count of where the bit will go */
+	a->used = z + 1;
+
+	if (a->used > a->alloc) {
+		if (pstm_grow(a, a->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	/* put the single bit in its place */
+	a->dp[z] = ((pstm_digit)1) << (b % DIGIT_BIT);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	b = 2 * a (one-bit left shift with carry between digits; b gets a's sign)
+*/
+int32 pstm_mul_2(pstm_int * a, pstm_int * b)
+{
+	int32	res;
+	int16	x, oldused;
+
+/*
+	grow to accommodate result (one digit more than a uses, for the carry out)
+ */
+	if (b->alloc < a->used + 1) {
+		if ((res = pstm_grow (b, a->used + 1)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+	oldused = b->used;
+	b->used = a->used;
+
+	{
+		register pstm_digit r, rr, *tmpa, *tmpb;
+
+		/* alias for source */
+		tmpa = a->dp;
+
+		/* alias for dest */
+		tmpb = b->dp;
+
+		/* carry */
+		r = 0;
+		for (x = 0; x < a->used; x++) {
+/*
+			get what will be the *next* carry bit from the
+			MSB of the current digit
+*/
+			rr = *tmpa >> ((pstm_digit)(DIGIT_BIT - 1));
+/*
+			now shift up this digit, add in the carry [from the previous]
+*/
+			*tmpb++ = ((*tmpa++ << ((pstm_digit)1)) | r);
+/*
+			copy the carry that would be from the source
+			digit into the next iteration
+*/
+			r = rr;
+		}
+
+		/* new leading digit? (the carry is dropped if used == PSTM_MAX_SIZE-1) */
+		if (r != 0 && b->used != (PSTM_MAX_SIZE-1)) {
+			/* add a MSB which is always 1 at this point */
+			*tmpb = 1;
+			++(b->used);
+		}
+/*
+		now zero any excess digits on the destination that we didn't write to
+*/
+		tmpb = b->dp + b->used;
+		for (x = b->used; x < oldused; x++) {
+			*tmpb++ = 0;
+		}
+	}
+	b->sign = a->sign;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	unsigned subtraction c = |a| - |b|; ||a|| >= ||b|| ALWAYS! (PS_LIMIT_FAIL if b has more digits)
+*/
+int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16		oldbused, oldused;
+	int32		x;
+	pstm_word	t;
+
+	if (b->used > a->used) {
+		return PS_LIMIT_FAIL;
+	}
+	if (c->alloc < a->used) {
+		if ((x = pstm_grow (c, a->used)) != PSTM_OKAY) {
+			return x;
+		}
+	}
+	oldused  = c->used;
+	oldbused = b->used;	/* saved up front: c may alias b */
+	c->used  = a->used;
+	t = 0;			/* borrow, always 0 or 1 */
+
+	for (x = 0; x < oldbused; x++) {
+		t = ((pstm_word)a->dp[x]) - (((pstm_word)b->dp[x]) + t);
+		c->dp[x] = (pstm_digit)t;
+		t = (t >> DIGIT_BIT)&1;
+	}
+	for (; x < a->used; x++) {	/* digits of a above b: only the borrow is subtracted */
+		t = ((pstm_word)a->dp[x]) - t;
+		c->dp[x] = (pstm_digit)t;
+		t = (t >> DIGIT_BIT)&1;	/* mask as in the loop above: borrow must stay 0 or 1 */
+	}
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	unsigned addition: c = |a| + |b| (operand signs are not consulted)
+*/
+static int32 s_pstm_add(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16				x, y, oldused;
+	register pstm_word	t, adp, bdp;
+
+	y = a->used;
+	if (b->used > y) {
+		y = b->used;
+	}
+	oldused = c->used;
+	c->used = y;
+
+	if (c->used > c->alloc) {
+		if (pstm_grow(c, c->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	t = 0;
+	for (x = 0; x < y; x++) {
+		if (a->used <= x) {	/* x is past a's top digit: dp[x] is not a digit of a */
+			adp = 0;
+		} else {
+			adp = (pstm_word)a->dp[x];
+		}
+		if (b->used <= x) {	/* likewise for b */
+			bdp = 0;
+		} else {
+			bdp = (pstm_word)b->dp[x];
+		}
+		t         += (adp) + (bdp);
+		c->dp[x]   = (pstm_digit)t;
+		t        >>= DIGIT_BIT;
+	}
+	if (t != 0 && x < PSTM_MAX_SIZE) {
+		if (c->used == c->alloc) {
+			if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+		c->dp[c->used++] = (pstm_digit)t;
+		++x;
+	}
+
+	c->used = x;
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+
+/******************************************************************************/
+/*
+	signed subtraction: c = a - b
+	Only signs are handled here; the digit work is delegated to the unsigned
+	helpers s_pstm_add and s_pstm_sub
+*/
+int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int32     rc;
+	int16     sign_a = a->sign;
+	int16     sign_b = b->sign;
+
+	if (sign_a != sign_b) {
+		/*
+		 * Operands of opposite sign: the magnitude is |a| + |b| and
+		 * the result carries the sign of the first operand.
+		 */
+		c->sign = sign_a;
+		rc = s_pstm_add (a, b, c);
+	} else if (pstm_cmp_mag (a, b) != PSTM_LT) {
+		/*
+		 * Same sign, |a| >= |b|: the magnitude is |a| - |b| and the
+		 * sign of the first operand is kept.
+		 */
+		c->sign = sign_a;
+		rc = s_pstm_sub (a, b, c);
+	} else {
+		/*
+		 * Same sign, |a| < |b|: the magnitude is |b| - |a| and the
+		 * result takes the _opposite_ sign from the first operand.
+		 */
+		if (sign_a == PSTM_ZPOS) {
+			c->sign = PSTM_NEG;
+		} else {
+			c->sign = PSTM_ZPOS;
+		}
+		rc = s_pstm_sub (b, a, c);
+	}
+
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	c = a - b, with b given as a single digit
+*/
+int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c)
+{
+	pstm_int	digit;	/* b promoted to a pstm_int */
+	int32		rc = pstm_init_size(pool, &digit, sizeof(pstm_digit));
+
+	if (rc != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+	pstm_set(&digit, b);
+	rc = pstm_sub(a, &digit, c);
+	pstm_clear(&digit);
+	return rc;
+}
+
+/******************************************************************************/
+/*	sets up the montgomery reduction: stores -1/a mod 2**DIGIT_BIT in *rho.
+	Returns PS_ARG_FAIL if a is even (its low bit is clear), else PSTM_OKAY
+*/
+int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho)
+{
+	pstm_digit x, b;
+
+/*
+	fast inversion mod 2**k
+	Based on the fact that
+	XA = 1 (mod 2**n)	=>  (X(2-XA)) A = 1 (mod 2**2n)
+						=>  2*X*A - X*X*A*A = 1
+						=>  2*(1) - (1)     = 1
+ */
+	b = a->dp[0]; /* only the lowest digit of a is used */
+
+	if ((b & 1) == 0) { /* even value: rejected */
+		psTraceCrypto("pstm_montogomery_setup failure\n");
+		return PS_ARG_FAIL;
+	}
+
+	x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+	x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+	x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+	x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+#ifdef PSTM_64BIT
+	x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+#endif
+	/* rho = -1/m mod b, computed as 2**DIGIT_BIT - x */
+	*rho = (pstm_digit)(((pstm_word) 1 << ((pstm_word) DIGIT_BIT)) -
+		((pstm_word)x));
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+ *	computes a = B**n mod b (n = b->used) using only doubling and subtraction;
+ *	useful for normalizing numbers in a Montgomery system.
+ */
+int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b)
+{
+	int32     x;
+	int16     bits;
+
+	/* how many bits of last digit does b use */
+	bits = pstm_count_bits (b) % DIGIT_BIT;
+	if (!bits) bits = DIGIT_BIT;
+
+	/* compute A = B^(n-1) * 2^(bits-1) */
+	if (b->used > 1) {
+		if ((x = pstm_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) !=
+				PSTM_OKAY) {
+			return x;
+		}
+	} else {
+		pstm_set(a, 1);
+		bits = 1; /* start from 1 and double DIGIT_BIT times below */
+	}
+
+	/* now compute C = A * B mod b: double, then subtract b once if a >= b */
+	for (x = bits - 1; x < (int32)DIGIT_BIT; x++) {
+		if (pstm_mul_2 (a, a) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (pstm_cmp_mag (a, b) != PSTM_LT) {
+			if (s_pstm_sub (a, b, a) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a * 2**b (a shifted left by b bits)
+*/
+static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c)
+{
+	pstm_digit	carry, carrytmp, shift;
+	int16		x;
+
+	/* copy it */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* handle whole digits */
+	if (b >= DIGIT_BIT) {
+		if (pstm_lshd(c, b/DIGIT_BIT) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	b %= DIGIT_BIT;
+
+	/* shift the digits by the remaining bit count (less than DIGIT_BIT) */
+	if (b != 0) {
+		carry = 0;
+		shift = DIGIT_BIT - b;
+		for (x = 0; x < c->used; x++) {
+			carrytmp = c->dp[x] >> shift; /* bits pushed out of this digit */
+			c->dp[x] = (c->dp[x] << b) + carry;
+			carry = carrytmp;
+		}
+		/* store last carry if room; it is dropped once PSTM_MAX_SIZE is reached */
+		if (carry && x < PSTM_MAX_SIZE) {
+			if (c->used == c->alloc) {
+				if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) {
+					return PS_MEM_FAIL;
+				}
+			}
+			c->dp[c->used++] = carry;
+		}
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a mod 2**b (keeps the low b bits of a); c = 0 when b <= 0
+*/
+static int32 pstm_mod_2d(pstm_int *a, int16 b, pstm_int *c)
+{
+	int16	x;
+
+	/* zero if count less than or equal to zero */
+	if (b <= 0) {
+		pstm_zero(c);
+		return PSTM_OKAY;
+	}
+
+	/* get copy of input */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* if 2**b is larger than a then the copy is already the result */
+	if (b >= (DIGIT_BIT * a->used)) {
+		return PSTM_OKAY;
+	}
+
+	/* zero digits above the last digit of the modulus */
+	for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+		c->dp[x] = 0;
+	}
+	/* keep only the low (b % DIGIT_BIT) bits of the digit that straddles 2**b; */
+	/* the shift count stays below DIGIT_BIT, so it is defined for any b > 0 */
+	c->dp[b / DIGIT_BIT] &= (((pstm_digit)1) << (b % DIGIT_BIT)) - 1;
+	pstm_clamp (c);
+	return PSTM_OKAY;
+}
+
+
+/******************************************************************************/
+/*
+	c = a * b, where b is a single digit; c takes the sign of a
+*/
+int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c)
+{
+	pstm_word	w;
+	int32		res;
+	int16		x, oldused;
+
+	if (c->alloc < a->used + 1) { /* one extra digit for the final carry */
+		if ((res = pstm_grow (c, a->used + 1)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+	oldused = c->used;
+	c->used = a->used;
+	c->sign = a->sign;
+	w       = 0; /* double-width accumulator; its high part is the carry */
+	for (x = 0; x < a->used; x++) {
+		w         = ((pstm_word)a->dp[x]) * ((pstm_word)b) + w;
+		c->dp[x]  = (pstm_digit)w;
+		w         = w >> DIGIT_BIT;
+	}
+	if (w != 0 && (a->used != PSTM_MAX_SIZE)) { /* carry dropped at PSTM_MAX_SIZE */
+		c->dp[c->used++] = (pstm_digit)w;
+		++x;
+	}
+	for (; x < oldused; x++) { /* clear stale digits left over in c */
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a / 2**b; when d is not NULL it receives the remainder a mod 2**b
+*/
+int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c,
+					pstm_int *d)
+{
+	pstm_digit	D, r, rr;
+	int32		res;
+	int16		x;
+	pstm_int	t; /* holds the remainder; only initialized when d != NULL */
+
+	/* if the shift count is <= 0 then we do no work */
+	if (b <= 0) {
+		if (pstm_copy (a, c) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (d != NULL) {
+			pstm_zero (d);
+		}
+		return PSTM_OKAY;
+	}
+
+	/* get the remainder */
+	if (d != NULL) {
+		if (pstm_init(pool, &t) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (pstm_mod_2d (a, b, &t) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+			goto LBL_DONE;
+		}
+	}
+
+	/* copy */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		res = PS_MEM_FAIL;
+		goto LBL_DONE;
+	}
+
+	/* shift by as many digits in the bit count */
+	if (b >= (int32)DIGIT_BIT) {
+		pstm_rshd (c, b / DIGIT_BIT);
+	}
+
+	/* shift any bit count < DIGIT_BIT */
+	D = (pstm_digit) (b % DIGIT_BIT);
+	if (D != 0) {
+		register pstm_digit *tmpc, mask, shift;
+
+		/* mask selecting the low D bits of a digit */
+		mask = (((pstm_digit)1) << D) - 1;
+
+		/* shift for lsb */
+		shift = DIGIT_BIT - D;
+
+		/* alias, starting at the most significant used digit */
+		tmpc = c->dp + (c->used - 1);
+
+		/* carry */
+		r = 0;
+		for (x = c->used - 1; x >= 0; x--) {
+			/* get the lower  bits of this word in a temp */
+			rr = *tmpc & mask;
+
+			/* shift the current word and mix in the carry bits from previous */
+			*tmpc = (*tmpc >> D) | (r << shift);
+			--tmpc;
+
+			/* set the carry to the carry bits of the current word above */
+			r = rr;
+		}
+	}
+	pstm_clamp (c);
+
+	res = PSTM_OKAY;
+LBL_DONE: /* also reached on failure: t is copied to d and released either way */
+	if (d != NULL) {
+		if (pstm_copy(&t, d) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+		}
+		pstm_clear(&t);
+	}
+	return res;
+}
+
+/******************************************************************************/
+/*
+	b = a/2 (a shifted right by one bit); b takes the sign of a
+*/
+int32 pstm_div_2(pstm_int * a, pstm_int * b)
+{
+	int16	x, oldused;
+
+	if (b->alloc < a->used) {
+		if (pstm_grow(b, a->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	oldused = b->used;
+	b->used = a->used;
+	{
+		register pstm_digit r, rr, *tmpa, *tmpb;
+
+		/* source alias, starting at the most significant digit */
+		tmpa = a->dp + b->used - 1;
+
+		/* dest alias */
+		tmpb = b->dp + b->used - 1;
+
+		/* carry */
+		r = 0;
+		for (x = b->used - 1; x >= 0; x--) {
+			/* get the carry for the next iteration */
+			rr = *tmpa & 1;
+
+			/* shift the current digit, add in carry and store */
+			*tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+
+			/* forward carry to next iteration */
+			r = rr;
+		}
+
+		/* zero excess digits that b held before this call */
+		tmpb = b->dp + b->used;
+		for (x = b->used; x < oldused; x++) {
+			*tmpb++ = 0;
+		}
+	}
+	b->sign = a->sign;
+	pstm_clamp (b);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Allocates "a" and copies b into it (nothing is done when a and b are the same)
+ */
+int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int * b, int16 toSqr)
+{
+	int16	ndigits;
+	int32	rc;
+
+	if (a == b) {
+		return PSTM_OKAY;
+	}
+
+	ndigits = b->alloc;
+/*
+	Smart-size: usage has shown that a lot of these copies go on to be
+	squared, so when toSqr is set and b->alloc leaves no room for about
+	twice b->used digits, allocate (2 * used) + 3 digits up front instead
+*/
+	if (toSqr && (b->used * 2) + 2 >= ndigits) {
+		ndigits = (b->used * 2) + 3;
+	}
+
+	rc = pstm_init_size(pool, a, ndigits);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	return pstm_copy(b, a);
+}
+
+/******************************************************************************/
+/*
+	With some compilers, we have seen issues linking with the builtin
+	64 bit division routine. The issues with either manifest in a failure
+	to find 'udivdi3' at link time, or a runtime invalid instruction fault
+	during an RSA operation.
+	The routine below divides a 64 bit unsigned int by a 32 bit unsigned int
+	explicitly, rather than using the division operation
+		The 64 bit result is placed in the 'numerator' parameter
+		The 32 bit mod (remainder) of the division is the return parameter
+	Based on implementations by:
+		Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
+		Copyright (C) 1999 Hewlett-Packard Co
+		Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+*/
+#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT)
+static uint32 psDiv64(uint64 *numerator, uint32 denominator)
+{
+	uint64	rem = *numerator;
+	uint64	b = denominator;	/* divisor, doubled in step with d below */
+	uint64	res = 0;
+	uint64	d = 1;			/* quotient weight that goes with the current b */
+	uint32	high = rem >> 32;
+
+	if (high >= denominator) {	/* divide the upper 32 bits with a native 32-bit op */
+		high /= denominator;
+		res = (uint64) high << 32;
+		rem -= (uint64) (high * denominator) << 32;
+	}
+	while ((int64)b > 0 && b < rem) {	/* stop doubling once the top bit of b is set */
+		b = b+b;
+		d = d+d;
+	}
+	do {				/* shift-and-subtract, one quotient bit per pass */
+		if (rem >= b) {
+			rem -= b;
+			res += d;
+		}
+		b >>= 1;
+		d >>= 1;
+	} while (d);
+	*numerator = res;		/* quotient is handed back through the pointer */
+	return rem;
+}
+#endif /* USE_MATRIX_DIV64 */
+
+#if defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT)
+typedef unsigned long	uint128 __attribute__ ((mode(TI)));
+static uint64 psDiv128(uint128 *numerator, uint64 denominator)
+{
+	uint128	rem = *numerator;
+	uint128	b = denominator;	/* divisor, doubled in step with d below */
+	uint128	res = 0;
+	uint128	d = 1;			/* quotient weight that goes with the current b */
+	uint64	high = rem >> 64;
+
+	if (high >= denominator) {	/* divide the upper 64 bits with a native 64-bit op */
+		high /= denominator;
+		res = (uint128) high << 64;
+		rem -= (uint128) (high * denominator) << 64;
+	}
+	while ((uint128)b > 0 && b < rem) {	/* NOTE(review): psDiv64 uses a signed cast here */
+		b = b+b;
+		d = d+d;
+	}
+	do {				/* shift-and-subtract, one quotient bit per pass */
+		if (rem >= b) {
+			rem -= b;
+			res += d;
+		}
+		b >>= 1;
+		d >>= 1;
+	} while (d);
+	*numerator = res;		/* quotient is handed back through the pointer */
+	return rem;
+}
+#endif /* USE_MATRIX_DIV128 */
+
+/******************************************************************************/
+/*	a/b => cb + d == a, i.e. c = quotient and d = remainder; either of c and d
+	may be NULL.  Returns PS_LIMIT_FAIL if b is zero, PSTM_OKAY on success
+*/
+int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+				pstm_int *d)
+{
+	pstm_int	q, x, y, t1, t2;
+	int32		res;
+	int16		n, t, i, norm, neg;
+
+	/* is divisor zero ? */
+	if (pstm_iszero (b) == 1) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* if a < b then q=0, r = a */
+	if (pstm_cmp_mag (a, b) == PSTM_LT) {
+		if (d != NULL) {
+			if (pstm_copy(a, d) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+		if (c != NULL) {
+			pstm_zero (c);
+		}
+		return PSTM_OKAY;
+	}
+/*
+	Smart-size inits
+*/
+	if ((res = pstm_init_size(pool, &t1, a->alloc)) != PSTM_OKAY) {
+		return res;
+	}
+	if ((res = pstm_init_size(pool, &t2, 3)) != PSTM_OKAY) {
+		goto LBL_T1;
+	}
+	if ((res = pstm_init_copy(pool, &x, a, 0)) != PSTM_OKAY) {
+		goto LBL_T2;
+	}
+/*
+	Used to be an init_copy on b but pstm_grow was always hit with triple size
+*/
+	if ((res = pstm_init_size(pool, &y, b->used * 3)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+	if ((res = pstm_copy(b, &y)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+
+	/* fix the sign: remember the quotient's sign, then work on magnitudes */
+	neg = (a->sign == b->sign) ? PSTM_ZPOS : PSTM_NEG;
+	x.sign = y.sign = PSTM_ZPOS;
+
+	/* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+	norm = pstm_count_bits(&y) % DIGIT_BIT;
+	if (norm < (int32)(DIGIT_BIT-1)) {
+		norm = (DIGIT_BIT-1) - norm;
+		if ((res = pstm_mul_2d(&x, norm, &x)) != PSTM_OKAY) {
+			goto LBL_Y;
+		}
+		if ((res = pstm_mul_2d(&y, norm, &y)) != PSTM_OKAY) {
+			goto LBL_Y;
+		}
+	} else {
+		norm = 0;
+	}
+
+	/* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+	n = x.used - 1;
+	t = y.used - 1;
+
+	if ((res = pstm_init_size(pool, &q, n - t + 1)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	q.used = n - t + 1;
+
+	/* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
+	if ((res = pstm_lshd(&y, n - t)) != PSTM_OKAY) { /* y = y*b**{n-t} */
+		goto LBL_Q;
+	}
+
+	while (pstm_cmp (&x, &y) != PSTM_LT) {
+		++(q.dp[n - t]);
+		if ((res = pstm_sub(&x, &y, &x)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+	}
+
+	/* reset y by shifting it back down */
+	pstm_rshd (&y, n - t);
+
+	/* step 3. for i from n down to (t + 1) */
+	for (i = n; i >= (t + 1); i--) {
+		if (i > x.used) {
+			continue;
+		}
+
+		/* step 3.1 if xi == yt then set q{i-t-1} to b-1,
+		 * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+		if (x.dp[i] == y.dp[t]) {
+			q.dp[i - t - 1] = (pstm_digit)((((pstm_word)1) << DIGIT_BIT) - 1);
+		} else {
+			pstm_word tmp;
+			tmp = ((pstm_word) x.dp[i]) << ((pstm_word) DIGIT_BIT);
+			tmp |= ((pstm_word) x.dp[i - 1]);
+#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT)
+			psDiv64(&tmp, y.dp[t]);
+#elif defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT)
+			psDiv128(&tmp, y.dp[t]);
+#else
+			tmp /= ((pstm_word) y.dp[t]);
+#endif /* USE_MATRIX_DIV64 */
+			q.dp[i - t - 1] = (pstm_digit) (tmp);
+		}
+
+		/* while (q{i-t-1} * (yt * b + y{t-1})) >
+			 xi * b**2 + xi-1 * b + xi-2
+
+			do q{i-t-1} -= 1;
+		*/
+		q.dp[i - t - 1] = (q.dp[i - t - 1] + 1); /* pre-increment: the loop decrements first */
+		do {
+			q.dp[i - t - 1] = (q.dp[i - t - 1] - 1);
+
+			/* find left hand */
+			pstm_zero (&t1);
+			t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+			t1.dp[1] = y.dp[t];
+			t1.used = 2;
+			if ((res = pstm_mul_d (&t1, q.dp[i - t - 1], &t1)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+
+			/* find right hand */
+			t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+			t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+			t2.dp[2] = x.dp[i];
+			t2.used = 3;
+		} while (pstm_cmp_mag(&t1, &t2) == PSTM_GT);
+
+		/* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
+		if ((res = pstm_mul_d(&y, q.dp[i - t - 1], &t1)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+
+		if ((res = pstm_lshd(&t1, i - t - 1)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+
+		if ((res = pstm_sub(&x, &t1, &x)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+
+		/* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
+		if (x.sign == PSTM_NEG) {
+			if ((res = pstm_copy(&y, &t1)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+			if ((res = pstm_lshd (&t1, i - t - 1)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+			if ((res = pstm_add (&x, &t1, &x)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+			q.dp[i - t - 1] = q.dp[i - t - 1] - 1;
+		}
+	}
+/*
+	now q is the quotient and x is the remainder (which we have to normalize)
+*/
+	/* get sign before writing to c */
+	x.sign = x.used == 0 ? PSTM_ZPOS : a->sign;
+
+	if (c != NULL) {
+		pstm_clamp (&q);
+		if (pstm_copy (&q, c) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+			goto LBL_Q;
+		}
+		c->sign = neg;
+	}
+
+	if (d != NULL) {
+		if ((res = pstm_div_2d (pool, &x, norm, &x, NULL)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+/*
+		the following is a kludge, essentially we were seeing the right
+		remainder but with excess digits that should have been zero
+ */
+		for (i = b->used; i < x.used; i++) {
+			x.dp[i] = 0;
+		}
+		pstm_clamp(&x);
+		if (pstm_copy (&x, d) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+			goto LBL_Q;
+		}
+	}
+
+	res = PSTM_OKAY;
+
+LBL_Q:pstm_clear (&q); /* labels release the temporaries in reverse order of init */
+LBL_Y:pstm_clear (&y);
+LBL_X:pstm_clear (&x);
+LBL_T2:pstm_clear (&t2);
+LBL_T1:pstm_clear (&t1);
+
+	return res;
+}
+
+/******************************************************************************/
+/*
+	Exchange the contents of two integers by value (the whole pstm_int
+	struct of each is swapped), for cases where you can't simply swap
+	the pstm_int pointers around
+*/
+void pstm_exch(pstm_int * a, pstm_int * b)
+{
+	pstm_int		saved_a = *a;
+
+	*a	= *b;
+	*b	= saved_a;
+}
+
+/******************************************************************************/
+/*
+	c = a mod b, 0 <= c < b
+*/
+int32 pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	pstm_int	rem;
+	int32		rc;
+
+	rc = pstm_init_size(pool, &rem, b->alloc);	/* Smart-size */
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	rc = pstm_div(pool, a, b, NULL, &rem);
+	if (rc == PSTM_OKAY) {
+		if (rem.sign == b->sign) {
+			/* swap the remainder into c; c's previous contents are cleared below */
+			pstm_exch (&rem, c);
+		} else {
+			/* remainder and modulus differ in sign: c = remainder + b */
+			rc = pstm_add(&rem, b, c);
+		}
+	}
+	pstm_clear (&rem);
+	return rc;
+}
+
+/******************************************************************************/
+/*
+	d = a * b (mod c)
+*/
+int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+			pstm_int *d)
+{
+	pstm_int	product;
+	int32		rc;
+	int16		ndigits;
+
+/*
+	Smart-size the temporary that holds a * b.  The output d is influenced by
+	this local, so when d is the same object as a keep at least a->alloc
+	digits: shrinking it would only lead to a pstm_grow in RSA operations
+*/
+	ndigits = a->used + b->used + 1;
+	if ((a == d) && (ndigits < a->alloc)) {
+		ndigits = a->alloc;
+	}
+	rc = pstm_init_size(pool, &product, ndigits);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	rc = pstm_mul_comba(pool, a, b, &product, NULL, 0);
+	if (rc == PSTM_OKAY) {
+		rc = pstm_mod(pool, &product, c, d);
+	}
+	pstm_clear(&product);
+	return rc;
+}
+
+/******************************************************************************/
+/*
+ *	Y = G**X (mod P), windowed exponentiation on Montgomery-form values
+ *	Some restrictions... X must be positive and < P; an even P is rejected
+ */
+int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P,
+			pstm_int *Y)
+{
+	pstm_int	M[32], res; /* Keep this winsize based: (1 << max_winsize) */
+	pstm_digit	buf, mp;
+	pstm_digit	*paD;
+	int32		err, bitbuf;
+	int16		bitcpy, bitcnt, mode, digidx, x, y, winsize;
+	uint32		paDlen;
+
+	/* set window size from what user set as optimization */
+	x = pstm_count_bits(X);
+	if (x < 50) {
+		winsize = 2;
+	} else {
+		winsize = PS_EXPTMOD_WINSIZE;
+	}
+
+	/* now setup montgomery  */
+	if ((err = pstm_montgomery_setup (P, &mp)) != PSTM_OKAY) {
+		return err;
+	}
+
+	/* setup result */
+	if ((err = pstm_init_size(pool, &res, (P->used * 2) + 1)) != PSTM_OKAY) {
+		return err;
+	}
+/*
+	create M table
+	The M table contains powers of the input base, e.g. M[x] = G^x mod P
+	The first half of the table is not computed though except for M[0] and M[1]
+ */
+	/* now we need R mod m */
+	if ((err = pstm_montgomery_calc_normalization (&res, P)) != PSTM_OKAY) {
+		goto LBL_RES;
+	}
+/*
+	init M array
+	init first cell
+ */
+	if ((err = pstm_init_size(pool, &M[1], res.used)) != PSTM_OKAY) {
+		goto LBL_RES;
+	}
+
+	/* now set M[1] to G * R mod m */
+	if (pstm_cmp_mag(P, G) != PSTM_GT) {
+		/* G >= P so we reduce it first */
+		if ((err = pstm_mod(pool, G, P, &M[1])) != PSTM_OKAY) {
+			goto LBL_M;
+		}
+	} else {
+		if ((err = pstm_copy(G, &M[1])) != PSTM_OKAY) {
+			goto LBL_M;
+		}
+	}
+	if ((err = pstm_mulmod (pool, &M[1], &res, P, &M[1])) != PSTM_OKAY) {
+		goto LBL_M;
+	}
+/*
+	Pre-allocated digit.  Used for mul, sqr, AND reduce
+*/
+	paDlen = ((M[1].used + 3) * 2) * sizeof(pstm_digit);
+	paD = xzalloc(paDlen);
+/*
+ 	compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times
+ */
+	if (pstm_init_copy(pool, &M[1 << (winsize - 1)], &M[1], 1) != PSTM_OKAY) {
+		err = PS_MEM_FAIL;
+		goto LBL_PAD;
+	}
+	for (x = 0; x < (winsize - 1); x++) {
+		if ((err = pstm_sqr_comba (pool, &M[1 << (winsize - 1)],
+				&M[1 << (winsize - 1)], paD, paDlen)) != PSTM_OKAY ||
+			(err = pstm_montgomery_reduce(pool, &M[1 << (winsize - 1)], P, mp,
+				paD, paDlen)) != PSTM_OKAY) {
+			/* this entry is live but LBL_PAD does not free it: do so here */
+			pstm_clear(&M[1 << (winsize - 1)]);
+			goto LBL_PAD;
+		}
+	}
+/*
+	now init the second half of the array
+*/
+	for (x = (1<<(winsize-1)) + 1; x < (1 << winsize); x++) {
+		if ((err = pstm_init_size(pool, &M[x], M[1<<(winsize-1)].alloc + 1))
+				!= PSTM_OKAY) {
+			for (y = 1<<(winsize-1); y < x; y++) {
+				pstm_clear(&M[y]);
+			}
+			goto LBL_PAD;
+		}
+	}
+
+	/* create upper table */
+	for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+		if ((err = pstm_mul_comba(pool, &M[x - 1], &M[1], &M[x], paD, paDlen))
+				!= PSTM_OKAY) {
+			goto LBL_MARRAY;
+		}
+		if ((err = pstm_montgomery_reduce(pool, &M[x], P, mp, paD, paDlen)) !=
+				PSTM_OKAY) {
+			goto LBL_MARRAY;
+		}
+	}
+
+	/* set initial mode and bit cnt */
+	mode   = 0;
+	bitcnt = 1;
+	buf    = 0;
+	digidx = X->used - 1;
+	bitcpy = 0;
+	bitbuf = 0;
+
+	for (;;) {
+		/* grab next digit as required */
+		if (--bitcnt == 0) {
+			/* if digidx == -1 we are out of digits so break */
+			if (digidx == -1) {
+				break;
+			}
+			/* read next digit and reset bitcnt */
+			buf    = X->dp[digidx--];
+			bitcnt = (int32)DIGIT_BIT;
+		}
+
+		/* grab the next msb from the exponent */
+		y     = (pstm_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+		buf <<= (pstm_digit)1;
+/*
+		 If the bit is zero and mode == 0 then we ignore it.
+		 These represent the leading zero bits before the first 1 bit
+		 in the exponent.  Technically this opt is not required but it
+		 does lower the # of trivial squaring/reductions used
+*/
+		if (mode == 0 && y == 0) {
+			continue;
+		}
+
+		/* if the bit is zero and mode == 1 then we square */
+		if (mode == 1 && y == 0) {
+			if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+					PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			continue;
+		}
+
+		/* else we add it to the window */
+		bitbuf |= (y << (winsize - ++bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/* ok window is filled so square as required and mul square first */
+			for (x = 0; x < winsize; x++) {
+				if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+						PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+				if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD,
+						paDlen)) != PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+			}
+
+			/* then multiply */
+			if ((err = pstm_mul_comba(pool, &res, &M[bitbuf], &res, paD,
+					paDlen)) != PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+
+			/* empty window and reset */
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/* if bits remain then square/multiply */
+	if (mode == 2 && bitcpy > 0) {
+		/* square then multiply if the bit is set */
+		for (x = 0; x < bitcpy; x++) {
+			if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+					PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+
+			/* get next bit of the window */
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+			/* then multiply */
+				if ((err = pstm_mul_comba(pool, &res, &M[1], &res, paD, paDlen))
+						!= PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+				if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD,
+						paDlen)) != PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+			}
+		}
+	}
+/*
+	Fix up result if Montgomery reduction is used recall that any value in a
+	Montgomery system is actually multiplied by R mod n.  So we have to reduce
+	one more time to cancel out the factor of R.
+*/
+	if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) !=
+			PSTM_OKAY) {
+		goto LBL_MARRAY;
+	}
+	/* copy res into Y */
+	if ((err = pstm_copy (&res, Y)) != PSTM_OKAY) {
+		goto LBL_MARRAY;
+	}
+	err = PSTM_OKAY;
+LBL_MARRAY: /* frees the upper half of M, including M[1 << (winsize-1)] */
+	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+		pstm_clear(&M[x]);
+	}
+LBL_PAD:psFree(paD, pool);
+LBL_M: pstm_clear(&M[1]);
+LBL_RES:pstm_clear(&res);
+	return err;
+}
+
+/******************************************************************************/
+/*
+	signed addition: c = a + b
+	Only signs are handled here; the digit work is delegated to the unsigned
+	helpers s_pstm_add and s_pstm_sub
+*/
+int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int32	rc;
+	int16	sign_a = a->sign;
+	int16	sign_b = b->sign;
+
+	if (sign_a == sign_b) {
+		/*
+		 * both positive or both negative: add the magnitudes and
+		 * keep the common sign
+		 */
+		c->sign = sign_a;
+		rc = s_pstm_add (a, b, c);
+	} else if (pstm_cmp_mag (a, b) == PSTM_LT) {
+		/*
+		 * opposite signs, |a| < |b|: the magnitude is |b| - |a| and
+		 * the result gets the sign of b, the one with the greater mag
+		 */
+		c->sign = sign_b;
+		rc = s_pstm_sub (b, a, c);
+	} else {
+		/*
+		 * opposite signs, |a| >= |b|: the magnitude is |a| - |b| and
+		 * the result gets the sign of a
+		 */
+		c->sign = sign_a;
+		rc = s_pstm_sub (a, b, c);
+	}
+
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	reverse an array of len bytes in place, used for radix code
+*/
+static void pstm_reverse (unsigned char *s, int16 len)
+{
+	int32     lo, hi;
+	unsigned char swap;
+
+	/*
+	 * walk inwards from both ends, exchanging one pair of bytes per
+	 * step, until the two indices meet
+	 */
+	for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
+		swap  = s[lo];
+		s[lo] = s[hi];
+		s[hi] = swap;
+	}
+}
+/******************************************************************************/
+/*
+	No reverse: stores a into b least significant byte first.  Useful in some
+	of the EIP-154 PKA stuff where special byte order seems to come into play
+*/
+int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a, unsigned char *b)
+{
+	int32     rc;
+	int16     nbytes;
+	pstm_int  work = { 0 };
+
+	rc = pstm_init_copy(pool, &work, a, 0);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	for (nbytes = 0; pstm_iszero (&work) == 0; nbytes++) {
+		b[nbytes] = (unsigned char) (work.dp[0] & 255);
+		rc = pstm_div_2d (pool, &work, 8, &work, NULL);
+		if (rc != PSTM_OKAY) {
+			pstm_clear(&work);
+			return rc;
+		}
+	}
+	pstm_clear(&work);
+	return PS_SUCCESS;
+}
+/******************************************************************************/
+/*
+	Stores a into b most significant byte first (bytes are reversed at the end)
+*/
+int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a, unsigned char *b)
+{
+	int32     rc;
+	int16     nbytes;
+	pstm_int  work = { 0 };
+
+	rc = pstm_init_copy(pool, &work, a, 0);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	for (nbytes = 0; pstm_iszero (&work) == 0; nbytes++) {
+		b[nbytes] = (unsigned char) (work.dp[0] & 255);
+		rc = pstm_div_2d (pool, &work, 8, &work, NULL);
+		if (rc != PSTM_OKAY) {
+			pstm_clear(&work);
+			return rc;
+		}
+	}
+	pstm_reverse (b, nbytes);
+	pstm_clear(&work);
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	compare a against a single digit b; returns PSTM_LT, PSTM_GT or PSTM_EQ
+*/
+int32 pstm_cmp_d(pstm_int *a, pstm_digit b)
+{
+	pstm_digit	low;
+
+	/* a negative a, or an a with no digits against a non-zero b, is smaller */
+	if (a->sign == PSTM_NEG || (a->used == 0 && b != 0)) {
+		return PSTM_LT;
+	}
+
+	/* more than one digit in use: a is larger than any single digit */
+	if (a->used > 1) {
+		return PSTM_GT;
+	}
+
+	/* otherwise compare the lowest digit of a with b directly */
+	low = a->dp[0];
+	if (low == b) {
+		return PSTM_EQ;
+	}
+	return (low > b) ? PSTM_GT : PSTM_LT;
+}
+
+/*
+	c = 1/a (mod b), general version that pstm_invmod uses for an even b.
+	Need invmod for ECC and also private key loading for hardware crypto in
+	cases where dQ > dP.  The values must be switched and a new qP calculated
+*/
+static int32 pstm_invmod_slow(psPool_t *pool, pstm_int * a, pstm_int * b,
+				pstm_int * c)
+{
+	pstm_int  x, y, u, v, A, B, C, D;
+	int32     res;
+
+	/* b cannot be negative or zero */
+	if (b->sign == PSTM_NEG || pstm_iszero(b) == 1) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* init temps */
+	if (pstm_init_size(pool, &x, b->used) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* x = a mod b, y = b */
+	if ((res = pstm_mod(pool, a, b, &x)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+	if ((res = pstm_init_copy(pool, &y, b, 0)) != PSTM_OKAY) { /* res must carry the error */
+		goto LBL_X;
+	}
+
+	/* 2. [modified] if x,y are both even then return an error! */
+	if (pstm_iseven (&x) == 1 && pstm_iseven (&y) == 1) {
+		res = PS_FAILURE;
+		goto LBL_Y;
+	}
+
+	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+	if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) {
+		goto LBL_U;
+	}
+
+	if ((res = pstm_init_size(pool, &A, sizeof(pstm_digit))) != PSTM_OKAY) {
+		goto LBL_V;
+	}
+
+	if ((res = pstm_init_size(pool, &D, sizeof(pstm_digit))) != PSTM_OKAY) {
+		goto LBL_A;
+	}
+	pstm_set (&A, 1);
+	pstm_set (&D, 1);
+
+	if ((res = pstm_init(pool, &B)) != PSTM_OKAY) {
+		goto LBL_D;
+	}
+	if ((res = pstm_init(pool, &C)) != PSTM_OKAY) {
+		goto LBL_B;
+	}
+
+top:
+	/* 4.  while u is even do */
+	while (pstm_iseven (&u) == 1) {
+		/* 4.1 u = u/2 */
+		if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+
+		/* 4.2 if A or B is odd then */
+		if (pstm_isodd (&A) == 1 || pstm_isodd (&B) == 1) {
+			/* A = (A+y)/2, B = (B-x)/2 */
+			if ((res = pstm_add (&A, &y, &A)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+			if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+		}
+		/* A = A/2, B = B/2 */
+		if ((res = pstm_div_2 (&A, &A)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_div_2 (&B, &B)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* 5.  while v is even do */
+	while (pstm_iseven (&v) == 1) {
+		/* 5.1 v = v/2 */
+		if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+
+		/* 5.2 if C or D is odd then */
+		if (pstm_isodd (&C) == 1 || pstm_isodd (&D) == 1) {
+			/* C = (C+y)/2, D = (D-x)/2 */
+			if ((res = pstm_add (&C, &y, &C)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+			if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+		}
+		/* C = C/2, D = D/2 */
+		if ((res = pstm_div_2 (&C, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_div_2 (&D, &D)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* 6.  if u >= v then */
+	if (pstm_cmp (&u, &v) != PSTM_LT) {
+		/* u = u - v, A = A - C, B = B - D */
+		if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&A, &C, &A)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+	} else {
+		/* v = v - u, C = C - A, D = D - B */
+		if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&C, &A, &C)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+	}
+
+	/* if not zero goto step 4 */
+	if (pstm_iszero (&u) == 0)
+		goto top;
+
+	/* now a = C, b = D, gcd == g*v */
+
+	/* if v != 1 then there is no inverse */
+	if (pstm_cmp_d (&v, 1) != PSTM_EQ) {
+		res = PS_FAILURE;
+		goto LBL_C;
+	}
+
+	/* if its too low */
+	while (pstm_cmp_d(&C, 0) == PSTM_LT) {
+		if ((res = pstm_add(&C, b, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* too big */
+	while (pstm_cmp_mag(&C, b) != PSTM_LT) {
+		if ((res = pstm_sub(&C, b, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* C is now the inverse */
+	if ((res = pstm_copy(&C, c)) != PSTM_OKAY) {
+		goto LBL_C;
+	}
+	res = PSTM_OKAY;
+
+LBL_C: pstm_clear(&C);
+LBL_B: pstm_clear(&B);	/* labels unwind in reverse init order: A, D, B, C */
+LBL_D: pstm_clear(&D);
+LBL_A: pstm_clear(&A);
+LBL_V: pstm_clear(&v);
+LBL_U: pstm_clear(&u);
+LBL_Y: pstm_clear(&y);
+LBL_X: pstm_clear(&x);
+
+	return res;
+}
+
+/* c = 1/a (mod b); fast path for odd b, an even b is handed to pstm_invmod_slow */
+int32 pstm_invmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	pstm_int	x, y, u, v, B, D;
+	int32		res;
+	uint16		neg, sanity;
+
+	/* 2. [modified] b must be odd   */
+	if (pstm_iseven (b) == 1) {
+		return pstm_invmod_slow(pool, a,b,c);
+	}
+
+	/* x == modulus, y == value to invert */
+	if ((res = pstm_init_copy(pool, &x, b, 0)) != PSTM_OKAY) {
+		return res;
+	}
+
+	if ((res = pstm_init_size(pool, &y, a->alloc)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+	/* we need y = |a| */
+	pstm_abs(a, &y);
+
+	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+	if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) {
+		goto LBL_U;
+	}
+	if ((res = pstm_init(pool, &B)) != PSTM_OKAY) {
+		goto LBL_V;
+	}
+	if ((res = pstm_init(pool, &D)) != PSTM_OKAY) {
+		goto LBL_B;
+	}
+
+	pstm_set (&D, 1);
+
+	sanity = 0; /* counts passes through "top" */
+top:
+	/* 4.  while u is even do */
+	while (pstm_iseven (&u) == 1) {
+		/* 4.1 u = u/2 */
+		if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) {
+			goto LBL_D;
+		}
+
+		/* 4.2 if B is odd then B = B - x */
+		if (pstm_isodd (&B) == 1) {
+			if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) {
+				goto LBL_D;
+			}
+		}
+		/* B = B/2 */
+		if ((res = pstm_div_2 (&B, &B)) !=  PSTM_OKAY) {
+			goto LBL_D;
+		}
+	}
+
+	/* 5.  while v is even do */
+	while (pstm_iseven (&v) == 1) {
+		/* 5.1 v = v/2 */
+		if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) {
+			goto LBL_D;
+		}
+		/* 5.2 if D is odd then */
+		if (pstm_isodd (&D) == 1) {
+			/* D = (D-x)/2 */
+			if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) {
+				goto LBL_D;
+			}
+		}
+		/* D = D/2 */
+		if ((res = pstm_div_2 (&D, &D)) !=  PSTM_OKAY) {
+			goto LBL_D;
+		}
+	}
+
+	/* 6.  if u >= v then */
+	if (pstm_cmp (&u, &v) != PSTM_LT) {
+		/* u = u - v, B = B - D */
+		if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+		if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+	} else {
+		/* v = v - u, D = D - B */
+		if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+		if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+	}
+
+	/* give up with PS_LIMIT_FAIL after too many passes; else if not zero goto step 4 */
+	if (sanity++ > 1000) {
+		res = PS_LIMIT_FAIL;
+		goto LBL_D;
+	}
+	if (pstm_iszero (&u) == 0) {
+		goto top;
+	}
+
+	/* now a = C, b = D, gcd == g*v */
+
+	/* if v != 1 then there is no inverse */
+	if (pstm_cmp_d (&v, 1) != PSTM_EQ) {
+		res = PS_FAILURE;
+		goto LBL_D;
+	}
+
+	/* D is now the inverse; add b until it is no longer negative */
+	neg = a->sign;
+	while (D.sign == PSTM_NEG) {
+		if ((res = pstm_add (&D, b, &D)) != PSTM_OKAY) {
+			goto LBL_D;
+		}
+	}
+	if ((res = pstm_copy (&D, c)) != PSTM_OKAY) {
+		goto LBL_D;
+	}
+	c->sign = neg; /* the result is given the sign of a */
+	res = PSTM_OKAY;
+
+LBL_D: pstm_clear(&D); /* labels release the temporaries in reverse order of init */
+LBL_B: pstm_clear(&B);
+LBL_V: pstm_clear(&v);
+LBL_U: pstm_clear(&u);
+LBL_Y: pstm_clear(&y);
+LBL_X: pstm_clear(&x);
+	return res;
+}
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_pstm.h b/networking/tls_pstm.h
new file mode 100644
index 0000000..1affc1b
--- /dev/null
+++ b/networking/tls_pstm.h
@@ -0,0 +1,238 @@
+/**
+ *	@file    pstm.h
+ *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ *	multiple-precision integer library.
+ */
+/*
+ *	Copyright (c) 2013-2015 INSIDE Secure Corporation
+ *	Copyright (c) PeerSec Networks, 2002-2011
+ *	All Rights Reserved
+ *
+ *	The latest version of this code is available at http://www.matrixssl.org
+ *
+ *	This software is open source; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	This General Public License does NOT permit incorporating this software
+ *	into proprietary programs.  If you are unable to comply with the GPL, a
+ *	commercial license for this software may be purchased from INSIDE at
+ *	http://www.insidesecure.com/eng/Company/Locations
+ *
+ *	This program is distributed in WITHOUT ANY WARRANTY; without even the
+ *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *	See the GNU General Public License for more details.
+ *
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *	http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+#ifndef _h_PSTMATH
+#define _h_PSTMATH
+#ifndef DISABLE_PSTM
+
+/* Define this here to avoid including circular limits.h on some platforms */
+#ifndef CHAR_BIT
+#define CHAR_BIT	8
+#endif
+
+/******************************************************************************/
+/*
+	If native 64 bit integers are not supported, we do not support 32x32->64
+	in hardware, so we must set the 16 bit flag to produce 16x16->32 products.
+*/
+#ifndef HAVE_NATIVE_INT64
+	#define PSTM_16BIT
+#endif /* ! HAVE_NATIVE_INT64 */
+
+/******************************************************************************/
+/*
+	Some default configurations.
+
+	pstm_word should be the largest value the processor can hold as the product
+		of a multiplication. Most platforms support a 32x32->64 MAC instruction,
+		so 64bits is the default pstm_word size.
+	pstm_digit should be half the size of pstm_word
+ */
+#ifdef PSTM_8BIT
+/*	8-bit digits, 16-bit word products */
+	typedef unsigned char		pstm_digit;
+	typedef unsigned short		pstm_word;
+	#define DIGIT_BIT			8
+
+#elif defined(PSTM_16BIT)
+/*	16-bit digits, 32-bit word products */
+	typedef unsigned short		pstm_digit;
+	typedef unsigned long		pstm_word;
+	#define	DIGIT_BIT			16
+
+#elif defined(PSTM_64BIT)
+/*	64-bit digits, 128-bit word products */
+	#ifndef __GNUC__
+	#error "64bit digits requires GCC"
+	#endif
+	typedef unsigned long		pstm_digit;
+	typedef unsigned long		pstm_word __attribute__ ((mode(TI)));
+	#define DIGIT_BIT			64
+
+#else
+/*	This is the default case, 32-bit digits, 64-bit word products */
+	typedef uint32			pstm_digit;
+	typedef uint64			pstm_word;
+	#define DIGIT_BIT		32
+	#define PSTM_32BIT
+#endif /* digit and word size */
+
+#define PSTM_MASK			(pstm_digit)(-1)
+#define PSTM_DIGIT_MAX		PSTM_MASK
+
+/******************************************************************************/
+/*
+	equalities
+ */
+#define PSTM_LT			-1		/* less than */
+#define PSTM_EQ			0		/* equal to */
+#define PSTM_GT			1		/* greater than */
+
+#define PSTM_ZPOS		0		/* positive integer */
+#define PSTM_NEG		1		/* negative */
+
+#define PSTM_OKAY		PS_SUCCESS
+#define PSTM_MEM		PS_MEM_FAIL
+
+/******************************************************************************/
+/*
+	Various build options
+ */
+#define PSTM_DEFAULT_INIT 64		/* default (64) digits of allocation */
+#define PSTM_MAX_SIZE	4096
+
+typedef struct  {
+	int16	used, alloc, sign;	/* used: digits in use (0 means the value is zero); alloc: presumably the digit capacity of dp - confirm in pstm_grow; sign: PSTM_ZPOS or PSTM_NEG */
+	pstm_digit	*dp;	/* digit array, dp[0] is the least significant digit */
+	psPool_t	*pool;	/* NOTE(review): memory pool handle; its use is not visible in this header - confirm */
+} pstm_int;
+
+/******************************************************************************/
+/*
+	Operations on large integers
+ */
+#define pstm_iszero(a) (((a)->used == 0) ? PS_TRUE : PS_FALSE)
+#define pstm_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? PS_TRUE : PS_FALSE)
+#define pstm_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? PS_TRUE : PS_FALSE)
+#define pstm_abs(a, b)  { pstm_copy(a, b); (b)->sign  = 0; }
+
+extern void pstm_set(pstm_int *a, pstm_digit b);
+
+extern void pstm_zero(pstm_int * a);
+
+extern int32 pstm_init(psPool_t *pool, pstm_int * a);
+
+extern int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size);
+
+extern int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int * b,
+				int16 toSqr);
+
+extern int16 pstm_count_bits (pstm_int * a);
+
+extern int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a,
+				uint32 len);
+
+extern int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c);
+
+extern int32 pstm_unsigned_bin_size(pstm_int *a);
+
+extern int32 pstm_copy(pstm_int * a, pstm_int * b);
+
+extern void pstm_exch(pstm_int * a, pstm_int * b);
+
+extern void pstm_clear(pstm_int * a);
+
+extern void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2,
+				pstm_int *mp3, pstm_int *mp4, pstm_int *mp5, pstm_int *mp6,
+				pstm_int *mp7);
+
+extern int32 pstm_grow(pstm_int * a, int16 size);
+
+extern void pstm_clamp(pstm_int * a);
+
+extern int32 pstm_cmp(pstm_int * a, pstm_int * b);
+
+extern int32 pstm_cmp_mag(pstm_int * a, pstm_int * b);
+
+extern void pstm_rshd(pstm_int *a, int16 x);
+
+extern int32 pstm_lshd(pstm_int * a, int16 b);
+
+extern int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+				pstm_int *d);
+
+extern int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c,
+				pstm_int *d);
+
+extern int32 pstm_div_2(pstm_int * a, pstm_int * b);
+
+extern int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c);
+
+extern int32 pstm_mul_2(pstm_int * a, pstm_int * b);
+
+extern int32 pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+				pstm_int *d);
+
+extern int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P,
+				pstm_int *Y);
+
+extern int32 pstm_2expt(pstm_int *a, int16 b);
+
+extern int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a,
+				unsigned char *b);
+
+extern int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a,
+				unsigned char *b);
+
+extern int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho);
+
+///bbox: pool unused
+#define pstm_montgomery_reduce(pool, a, m, mp, paD, paDlen) \
+        pstm_montgomery_reduce(      a, m, mp, paD, paDlen)
+extern int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
+				pstm_digit mp, pstm_digit *paD, uint32 paDlen);
+
+#define pstm_mul_comba(pool, A, B, C, paD, paDlen) \
+        pstm_mul_comba(      A, B, C, paD, paDlen)
+extern int32 pstm_mul_comba(psPool_t *pool, pstm_int *A, pstm_int *B,
+				pstm_int *C, pstm_digit *paD, uint32 paDlen);
+
+///bbox: pool unused
+#define pstm_sqr_comba(pool, A, B, paD, paDlen) \
+        pstm_sqr_comba(      A, B, paD, paDlen)
+extern int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B,
+				pstm_digit *paD, uint32 paDlen);
+
+extern int32 pstm_cmp_d(pstm_int *a, pstm_digit b);
+
+extern int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b);
+
+extern int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c);
+
+extern int32 pstm_invmod(psPool_t *pool, pstm_int * a, pstm_int * b,
+				pstm_int * c);
+
+#else /* DISABLE_PSTM */
+	typedef int32 pstm_int;
+#endif /* !DISABLE_PSTM */
+#endif /* _h_PSTMATH */
+
diff --git a/networking/tls_pstm_montgomery_reduce.c b/networking/tls_pstm_montgomery_reduce.c
new file mode 100644
index 0000000..c231c4d
--- /dev/null
+++ b/networking/tls_pstm_montgomery_reduce.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ *	@file    pstm_montgomery_reduce.c
+ *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ *	Multiprecision Montgomery Reduction.
+ */
+/*
+ *	Copyright (c) 2013-2015 INSIDE Secure Corporation
+ *	Copyright (c) PeerSec Networks, 2002-2011
+ *	All Rights Reserved
+ *
+ *	The latest version of this code is available at http://www.matrixssl.org
+ *
+ *	This software is open source; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	This General Public License does NOT permit incorporating this software
+ *	into proprietary programs.  If you are unable to comply with the GPL, a
+ *	commercial license for this software may be purchased from INSIDE at
+ *	http://www.insidesecure.com/eng/Company/Locations
+ *
+ *	This program is distributed in WITHOUT ANY WARRANTY; without even the
+ *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *	See the GNU General Public License for more details.
+ *
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *	http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+/******************************************************************************/
+
+#if defined(PSTM_X86)
+/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
+#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
+#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
+#endif
+//#pragma message ("Using 32 bit x86 Assembly Optimizations")
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL                                          \
+asm(                                                      \
+   "movl %5,%%eax \n\t"                                   \
+   "mull %4       \n\t"                                   \
+   "addl %1,%%eax \n\t"                                   \
+   "adcl $0,%%edx \n\t"                                   \
+   "addl %%eax,%0 \n\t"                                   \
+   "adcl $0,%%edx \n\t"                                   \
+   "movl %%edx,%1 \n\t"                                   \
+:"=g"(_c[LO]), "=r"(cy)                                   \
+:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
+: "%eax", "%edx", "%cc")
+
+#define PROPCARRY                           \
+asm(                                        \
+   "addl   %1,%0    \n\t"                   \
+   "setb   %%al     \n\t"                   \
+   "movzbl %%al,%1 \n\t"                    \
+:"=g"(_c[LO]), "=r"(cy)                     \
+:"0"(_c[LO]), "1"(cy)                       \
+: "%eax", "%cc")
+
+/******************************************************************************/
+#elif defined(PSTM_X86_64)
+/* x86-64 optimized */
+#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
+#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
+#endif
+//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+mu = c[x] * mp
+
+#define INNERMUL                                           \
+asm(                                                       \
+	"movq %5,%%rax \n\t"                                   \
+	"mulq %4       \n\t"                                   \
+	"addq %1,%%rax \n\t"                                   \
+	"adcq $0,%%rdx \n\t"                                   \
+	"addq %%rax,%0 \n\t"                                   \
+	"adcq $0,%%rdx \n\t"                                   \
+	"movq %%rdx,%1 \n\t"                                   \
+	:"=g"(_c[LO]), "=r"(cy)                                \
+	:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)           \
+	: "%rax", "%rdx", "cc")
+
+#define INNERMUL8				\
+asm(							\
+	"movq 0(%5),%%rax    \n\t"  \
+	"movq 0(%2),%%r10    \n\t"  \
+	"movq 0x8(%5),%%r11  \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x8(%2),%%r10  \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0(%0)    \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"movq 0x10(%5),%%r11 \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x10(%2),%%r10 \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x8(%0)  \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"movq 0x18(%5),%%r11 \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x18(%2),%%r10 \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x10(%0) \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"movq 0x20(%5),%%r11 \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x20(%2),%%r10 \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x18(%0) \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"movq 0x28(%5),%%r11 \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x28(%2),%%r10 \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x20(%0) \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"movq 0x30(%5),%%r11 \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x30(%2),%%r10 \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x28(%0) \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"movq 0x38(%5),%%r11 \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq 0x38(%2),%%r10 \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x30(%0) \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	"movq %%r11,%%rax    \n\t"  \
+	"mulq %4             \n\t"  \
+	"addq %%r10,%%rax    \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"addq %3,%%rax       \n\t"  \
+	"adcq $0,%%rdx       \n\t"  \
+	"movq %%rax,0x38(%0) \n\t"  \
+	"movq %%rdx,%1       \n\t"  \
+	\
+	:"=r"(_c), "=r"(cy)                    \
+	: "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
+	: "%rax", "%rdx", "%r10", "%r11", "cc")
+
+#define PROPCARRY                          \
+asm(                                       \
+	"addq   %1,%0    \n\t"                 \
+	"setb   %%al     \n\t"                 \
+	"movzbq %%al,%1 \n\t"                  \
+	:"=g"(_c[LO]), "=r"(cy)                \
+	:"0"(_c[LO]), "1"(cy)                  \
+	: "%rax", "cc")
+
+/******************************************************************************/
+#elif defined(PSTM_ARM)
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+mu = c[x] * mp
+
+#ifdef __thumb2__
+//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
+#define INNERMUL                    \
+asm(                                \
+	" LDR    r0,%1            \n\t" \
+	" ADDS   r0,r0,%0         \n\t" \
+	" ITE CS                  \n\t" \
+	" MOVCS  %0,#1            \n\t" \
+	" MOVCC  %0,#0            \n\t" \
+	" UMLAL  r0,%0,%3,%4      \n\t" \
+	" STR    r0,%1            \n\t" \
+	:"=r"(cy),"=m"(_c[0])\
+	:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
+	:"r0","%cc");
+#define PROPCARRY                  \
+asm(                               \
+	" LDR   r0,%1            \n\t" \
+	" ADDS  r0,r0,%0         \n\t" \
+	" STR   r0,%1            \n\t" \
+	" ITE CS                 \n\t" \
+	" MOVCS %0,#1            \n\t" \
+	" MOVCC %0,#0            \n\t" \
+	:"=r"(cy),"=m"(_c[0])\
+	:"0"(cy),"m"(_c[0])\
+	:"r0","%cc");
+#else /* Non-Thumb2 code */
+//#pragma message ("Using 32 bit ARM Assembly Optimizations")
+#define INNERMUL                    \
+asm(                                \
+	" LDR    r0,%1            \n\t" \
+	" ADDS   r0,r0,%0         \n\t" \
+	" MOVCS  %0,#1            \n\t" \
+	" MOVCC  %0,#0            \n\t" \
+	" UMLAL  r0,%0,%3,%4      \n\t" \
+	" STR    r0,%1            \n\t" \
+	:"=r"(cy),"=m"(_c[0])\
+	:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
+	:"r0","%cc");
+#define PROPCARRY                  \
+asm(                               \
+	" LDR   r0,%1            \n\t" \
+	" ADDS  r0,r0,%0         \n\t" \
+	" STR   r0,%1            \n\t" \
+	" MOVCS %0,#1            \n\t" \
+	" MOVCC %0,#0            \n\t" \
+	:"=r"(cy),"=m"(_c[0])\
+	:"0"(cy),"m"(_c[0])\
+	:"r0","%cc");
+#endif /* __thumb2__ */
+
+
+/******************************************************************************/
+#elif defined(PSTM_MIPS)
+/* MIPS32. INNERMUL: cy:_c[0] = _c[0] + cy + mu * tmpm[0], then ++tmpm. PROPCARRY: _c[0] += cy, cy = carry out. cy is a tied in/out operand ("0") and _c[0] a memory input because the templates read both. */
+//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+mu = c[x] * mp
+
+#define INNERMUL                      \
+asm(								  \
+	" multu    %3,%4          \n\t"   \
+	" mflo     $12            \n\t"   \
+	" mfhi     $13            \n\t"   \
+	" addu     $12,$12,%0     \n\t"   \
+	" sltu     $10,$12,%0     \n\t"   \
+	" addu     $13,$13,$10    \n\t"   \
+	" lw       $10,%1         \n\t"   \
+	" addu     $12,$12,$10    \n\t"   \
+	" sltu     $10,$12,$10    \n\t"   \
+	" addu     %0,$13,$10     \n\t"   \
+	" sw       $12,%1         \n\t"   \
+	:"=r"(cy),"=m"(_c[0])\
+	:"0"(cy),"r"(mu),"r"(tmpm[0]),"m"(_c[0])\
+	:"$10","$12","$13")\
+; ++tmpm;
+
+#define PROPCARRY                     \
+asm(                                  \
+	" lw       $10,%1        \n\t"    \
+	" addu     $10,$10,%0    \n\t"    \
+	" sw       $10,%1        \n\t"    \
+	" sltu     %0,$10,%0     \n\t"    \
+	:"=r"(cy),"=m"(_c[0])\
+	:"0"(cy),"m"(_c[0])\
+	:"$10");
+
+
+/******************************************************************************/
+#else
+
+/* ISO C code */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+   mu = c[x] * mp
+
+#define INNERMUL										\
+	do { pstm_word t;									\
+		t = ((pstm_word)_c[0] + (pstm_word)cy) +		\
+			(((pstm_word)mu) * ((pstm_word)*tmpm++));	\
+		_c[0] = (pstm_digit)t;							\
+		cy = (pstm_digit)(t >> DIGIT_BIT);				\
+	} while (0)
+
+#define PROPCARRY \
+   do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
+
+#endif
+
+/******************************************************************************/
+
+#define LO 0
+
+/*
+ * Montgomery reduction: a = a/R mod m, where R = 2^(DIGIT_BIT * m->used).
+ * mp: per-modulus multiplier (presumably from pstm_montgomery_setup - confirm
+ * at the caller). paD/paDlen: optional scratch buffer and its size in bytes.
+ * Returns PSTM_OKAY, PS_LIMIT_FAIL (a too small/large) or PS_MEM_FAIL.
+ */
+int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
+		pstm_digit mp, pstm_digit *paD, uint32 paDlen)
+{
+	pstm_digit	*c, *_c, *tmpm, mu;
+	int32		oldused, x, y;
+	int16		pa;
+	uint32		need;	/* scratch size in bytes */
+
+	pa = m->used;
+	oldused = a->used;
+	/* Sanity test for bad numbers: a->dp gets pa+1 digits written below and
+	 * the scratch area holds 2*pa+1 digits, so neither may be overrun */
+	if (a->alloc < pa + 1 || oldused > 2*pa + 1) {
+		return PS_LIMIT_FAIL;
+	}
+	/* paDlen counts bytes (it is handed to memset), so compare in bytes */
+	need = (uint32)(2*pa + 1) * sizeof(pstm_digit);
+	if (paD && paDlen >= need) {
+		c = paD;
+		memset(c, 0x0, paDlen);
+	} else {
+		c = xzalloc(need);
+	}
+	/* copy the input */
+	for (x = 0; x < oldused; x++) {
+		c[x] = a->dp[x];
+	}
+
+	MONT_START;
+	for (x = 0; x < pa; x++) {
+		pstm_digit cy = 0;
+		/* get Mu for this round */
+		LOOP_START;
+		_c   = c + x;
+		tmpm = m->dp;
+		y = 0;
+#ifdef PSTM_X86_64
+		for (; y < (pa & ~7); y += 8) {
+			INNERMUL8;
+			_c   += 8;
+			tmpm += 8;
+		}
+#endif /* PSTM_X86_64 */
+		for (; y < pa; y++) {
+			INNERMUL;
+			++_c;
+		}
+		LOOP_END;
+		while (cy) {
+			PROPCARRY;
+			++_c;
+		}
+	}
+
+	/* now copy out */
+	_c   = c + pa;
+	tmpm = a->dp;
+	for (x = 0; x < pa+1; x++) {
+		*tmpm++ = *_c++;
+	}
+	for (; x < oldused; x++)   {
+		*tmpm++ = 0;
+	}
+	MONT_FINI;
+	a->used = pa+1;
+	pstm_clamp(a);
+
+	/* if A >= m then A = A - m; x is reused as the return code */
+	x = PSTM_OKAY;
+	if (pstm_cmp_mag (a, m) != PSTM_LT && s_pstm_sub (a, m, a) != PSTM_OKAY) {
+		x = PS_MEM_FAIL;
+	}
+	if (c != paD) {
+		psFree(c, pool);
+	}
+	return x;
+}
+
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_pstm_mul_comba.c b/networking/tls_pstm_mul_comba.c
new file mode 100644
index 0000000..6e051ba
--- /dev/null
+++ b/networking/tls_pstm_mul_comba.c
@@ -0,0 +1,777 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ *	@file    pstm_mul_comba.c
+ *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ *	Multiprecision multiplication with Comba technique.
+ */
+/*
+ *	Copyright (c) 2013-2015 INSIDE Secure Corporation
+ *	Copyright (c) PeerSec Networks, 2002-2011
+ *	All Rights Reserved
+ *
+ *	The latest version of this code is available at http://www.matrixssl.org
+ *
+ *	This software is open source; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	This General Public License does NOT permit incorporating this software
+ *	into proprietary programs.  If you are unable to comply with the GPL, a
+ *	commercial license for this software may be purchased from INSIDE at
+ *	http://www.insidesecure.com/eng/Company/Locations
+ *
+ *	This program is distributed in WITHOUT ANY WARRANTY; without even the
+ *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *	See the GNU General Public License for more details.
+ *
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *	http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+/******************************************************************************/
+#if defined(PSTM_X86)
+/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
+#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
+#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
+#endif
+//#pragma message ("Using 32 bit x86 Assembly Optimizations")
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+   x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+   x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+/* this should multiply i and j  */
+#define MULADD(i, j)                                      \
+asm(                                                      \
+	 "movl  %6,%%eax     \n\t"                            \
+	 "mull  %7           \n\t"                            \
+	 "addl  %%eax,%0     \n\t"                            \
+	 "adcl  %%edx,%1     \n\t"                            \
+	 "adcl  $0,%2        \n\t"                            \
+	 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_X86_64)
+/* x86-64 optimized */
+#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
+#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
+#endif
+//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+/* this should multiply i and j  */
+#define MULADD(i, j)									\
+asm  (													\
+	"movq  %6,%%rax     \n\t"                            \
+	"mulq  %7           \n\t"                            \
+	"addq  %%rax,%0     \n\t"                            \
+	"adcq  %%rdx,%1     \n\t"                            \
+	"adcq  $0,%2        \n\t"                            \
+	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
+
+/******************************************************************************/
+#elif defined(PSTM_ARM)
+/* ARM code */
+//#pragma message ("Using 32 bit ARM Assembly Optimizations")
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+x = c0;
+
+#define COMBA_STORE2(x) \
+x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)                                          \
+asm(                                                          \
+	"  UMULL  r0,r1,%6,%7           \n\t"                     \
+	"  ADDS   %0,%0,r0              \n\t"                     \
+	"  ADCS   %1,%1,r1              \n\t"                     \
+	"  ADC    %2,%2,#0              \n\t"                     \
+	:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_MIPS)
+/* MIPS32 code */
+//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+x = c0;
+
+#define COMBA_STORE2(x) \
+x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)               \
+asm(                               \
+	" multu  %6,%7          \n\t"  \
+	" mflo   $12            \n\t"  \
+	" mfhi   $13            \n\t"  \
+	" addu    %0,%0,$12     \n\t"  \
+	" sltu   $12,%0,$12     \n\t"  \
+	" addu    %1,%1,$13     \n\t"  \
+	" sltu   $13,%1,$13     \n\t"  \
+	" addu    %1,%1,$12     \n\t"  \
+	" sltu   $12,%1,$12     \n\t"  \
+	" addu    %2,%2,$13     \n\t"  \
+	" addu    %2,%2,$12     \n\t"  \
+	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
+
+/******************************************************************************/
+#else
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)														\
+   do { pstm_word t;														\
+   t = (pstm_word)c0 + ((pstm_word)i) * ((pstm_word)j); c0 = (pstm_digit)t;	\
+   t = (pstm_word)c1 + (t >> DIGIT_BIT);									\
+   c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT);					\
+   } while (0);
+
+#endif
+
+/******************************************************************************/
+/*
+ * generic PxQ multiplier: C = A * B
+ *
+ * paD is an optional caller-supplied scratch buffer of paDlen bytes.
+ * Returns PS_SUCCESS, or PS_MEM_FAIL when C could not be grown to
+ * A->used + B->used digits.
+ */
+///bbox: pool unused
+#define pstm_mul_comba_gen(pool, A, B, C, paD, paDlen) \
+        pstm_mul_comba_gen(      A, B, C, paD, paDlen)
+static int32 pstm_mul_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
+			pstm_int *C, pstm_digit *paD, uint32 paDlen)
+{
+	int16		pa, own_dst;
+	int32		col, k, npairs, tx, ty, prev_used;
+	pstm_digit	c0, c1, c2, *tmpx, *tmpy, *dst;
+
+	/* upper bound on the output size; C is clamped at the end */
+	pa = A->used + B->used;
+
+	/* grow C first if it cannot hold pa digits */
+	if (C->alloc < pa && pstm_grow(C, pa) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+/*
+	The digits are accumulated in a scratch array and copied to C at the
+	end.  The caller's paD serves as scratch when it is present and at
+	least pa digits long (paDlen is in bytes); otherwise a zeroed
+	temporary is allocated here and released before returning.
+*/
+	own_dst = (paD == NULL) || (paDlen < (sizeof(pstm_digit) * pa));
+	if (own_dst) {
+		dst = xzalloc(sizeof(pstm_digit) * pa);
+	} else {
+		dst = paD;
+		memset(dst, 0x0, paDlen);
+	}
+
+	COMBA_START;
+	COMBA_CLEAR;
+
+	for (col = 0; col < pa; col++) {
+		/* first pair of this column: highest usable digit of B ... */
+		ty = min(col, B->used-1);
+		/* ... and the digit of A whose index makes the two add up to col */
+		tx = col - ty;
+		tmpx = A->dp + tx;
+		tmpy = B->dp + ty;
+
+/*
+		Number of digit pairs in this column: tmpx walks up through A
+		while tmpy walks down through B until either one runs out.
+*/
+		npairs = min(A->used-tx, ty+1);
+
+		/* shift the carry chain, then add up the column */
+		COMBA_FORWARD;
+		for (k = 0; k < npairs; k++) {
+			MULADD(*tmpx++, *tmpy--);
+		}
+
+		/* lowest accumulator word is the finished digit */
+		COMBA_STORE(dst[col]);
+	}
+	COMBA_FINI;
+
+/*
+	setup dest
+ */
+	prev_used = C->used;
+	C->used = pa;
+	C->sign = A->sign ^ B->sign;
+	for (col = 0; col < pa; col++) {
+		C->dp[col] = dst[col];
+	}
+	/* clear unused digits [that existed in the old copy of c] */
+	for (; col < prev_used; col++) {
+		C->dp[col] = 0;
+	}
+	pstm_clamp(C);
+
+	/* release the scratch array unless it belongs to the caller */
+	if (own_dst) {
+		psFree(dst, pool);
+	}
+
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+static int32 pstm_mul_comba16(pstm_int *A, pstm_int *B, pstm_int *C) /* C = A * B, fully unrolled for 16-digit operands */
+{
+	pstm_digit c0, c1, c2, at[32]; /* at[]: local copy of both operands; c0..c2 are only touched through the COMBA_xxx/MULADD macros (presumably the column accumulator) */
+	/* Make sure C has room for 32 digits; a failing pstm_grow() is reported to the caller as PS_MEM_FAIL. */
+	if (C->alloc < 32) {
+		if (pstm_grow(C, 32) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	memcpy(at, A->dp, 16 * sizeof(pstm_digit)); /* at[0..15]  = first 16 digits of A */
+	memcpy(at+16, B->dp, 16 * sizeof(pstm_digit)); /* at[16..31] = first 16 digits of B */
+	/* From here on the operands are read only from at[], never from A->dp / B->dp (presumably so C may alias A or B -- TODO confirm). */
+   COMBA_START;
+   /* Each "column k" section below (k = 0..30) feeds MULADD every pair at[i], at[16 + j] with i + j == k, then passes C->dp[k] to COMBA_STORE. */
+   COMBA_CLEAR;
+   /* column 0 */
+   MULADD(at[0], at[16]);
+   COMBA_STORE(C->dp[0]);
+   /* column 1 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[17]);    MULADD(at[1], at[16]);
+   COMBA_STORE(C->dp[1]);
+   /* column 2 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[18]);    MULADD(at[1], at[17]);    MULADD(at[2], at[16]);
+   COMBA_STORE(C->dp[2]);
+   /* column 3 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[19]);    MULADD(at[1], at[18]);    MULADD(at[2], at[17]);    MULADD(at[3], at[16]);
+   COMBA_STORE(C->dp[3]);
+   /* column 4 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[20]);    MULADD(at[1], at[19]);    MULADD(at[2], at[18]);    MULADD(at[3], at[17]);    MULADD(at[4], at[16]);
+   COMBA_STORE(C->dp[4]);
+   /* column 5 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[21]);    MULADD(at[1], at[20]);    MULADD(at[2], at[19]);    MULADD(at[3], at[18]);    MULADD(at[4], at[17]);    MULADD(at[5], at[16]);
+   COMBA_STORE(C->dp[5]);
+   /* column 6 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[22]);    MULADD(at[1], at[21]);    MULADD(at[2], at[20]);    MULADD(at[3], at[19]);    MULADD(at[4], at[18]);    MULADD(at[5], at[17]);    MULADD(at[6], at[16]);
+   COMBA_STORE(C->dp[6]);
+   /* column 7 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[23]);    MULADD(at[1], at[22]);    MULADD(at[2], at[21]);    MULADD(at[3], at[20]);    MULADD(at[4], at[19]);    MULADD(at[5], at[18]);    MULADD(at[6], at[17]);    MULADD(at[7], at[16]);
+   COMBA_STORE(C->dp[7]);
+   /* column 8 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[24]);    MULADD(at[1], at[23]);    MULADD(at[2], at[22]);    MULADD(at[3], at[21]);    MULADD(at[4], at[20]);    MULADD(at[5], at[19]);    MULADD(at[6], at[18]);    MULADD(at[7], at[17]);    MULADD(at[8], at[16]);
+   COMBA_STORE(C->dp[8]);
+   /* column 9 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[25]);    MULADD(at[1], at[24]);    MULADD(at[2], at[23]);    MULADD(at[3], at[22]);    MULADD(at[4], at[21]);    MULADD(at[5], at[20]);    MULADD(at[6], at[19]);    MULADD(at[7], at[18]);    MULADD(at[8], at[17]);    MULADD(at[9], at[16]);
+   COMBA_STORE(C->dp[9]);
+   /* column 10 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[26]);    MULADD(at[1], at[25]);    MULADD(at[2], at[24]);    MULADD(at[3], at[23]);    MULADD(at[4], at[22]);    MULADD(at[5], at[21]);    MULADD(at[6], at[20]);    MULADD(at[7], at[19]);    MULADD(at[8], at[18]);    MULADD(at[9], at[17]);    MULADD(at[10], at[16]);
+   COMBA_STORE(C->dp[10]);
+   /* column 11 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[27]);    MULADD(at[1], at[26]);    MULADD(at[2], at[25]);    MULADD(at[3], at[24]);    MULADD(at[4], at[23]);    MULADD(at[5], at[22]);    MULADD(at[6], at[21]);    MULADD(at[7], at[20]);    MULADD(at[8], at[19]);    MULADD(at[9], at[18]);    MULADD(at[10], at[17]);    MULADD(at[11], at[16]);
+   COMBA_STORE(C->dp[11]);
+   /* column 12 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[28]);    MULADD(at[1], at[27]);    MULADD(at[2], at[26]);    MULADD(at[3], at[25]);    MULADD(at[4], at[24]);    MULADD(at[5], at[23]);    MULADD(at[6], at[22]);    MULADD(at[7], at[21]);    MULADD(at[8], at[20]);    MULADD(at[9], at[19]);    MULADD(at[10], at[18]);    MULADD(at[11], at[17]);    MULADD(at[12], at[16]);
+   COMBA_STORE(C->dp[12]);
+   /* column 13 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[29]);    MULADD(at[1], at[28]);    MULADD(at[2], at[27]);    MULADD(at[3], at[26]);    MULADD(at[4], at[25]);    MULADD(at[5], at[24]);    MULADD(at[6], at[23]);    MULADD(at[7], at[22]);    MULADD(at[8], at[21]);    MULADD(at[9], at[20]);    MULADD(at[10], at[19]);    MULADD(at[11], at[18]);    MULADD(at[12], at[17]);    MULADD(at[13], at[16]);
+   COMBA_STORE(C->dp[13]);
+   /* column 14 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[30]);    MULADD(at[1], at[29]);    MULADD(at[2], at[28]);    MULADD(at[3], at[27]);    MULADD(at[4], at[26]);    MULADD(at[5], at[25]);    MULADD(at[6], at[24]);    MULADD(at[7], at[23]);    MULADD(at[8], at[22]);    MULADD(at[9], at[21]);    MULADD(at[10], at[20]);    MULADD(at[11], at[19]);    MULADD(at[12], at[18]);    MULADD(at[13], at[17]);    MULADD(at[14], at[16]);
+   COMBA_STORE(C->dp[14]);
+   /* column 15 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[31]);    MULADD(at[1], at[30]);    MULADD(at[2], at[29]);    MULADD(at[3], at[28]);    MULADD(at[4], at[27]);    MULADD(at[5], at[26]);    MULADD(at[6], at[25]);    MULADD(at[7], at[24]);    MULADD(at[8], at[23]);    MULADD(at[9], at[22]);    MULADD(at[10], at[21]);    MULADD(at[11], at[20]);    MULADD(at[12], at[19]);    MULADD(at[13], at[18]);    MULADD(at[14], at[17]);    MULADD(at[15], at[16]);
+   COMBA_STORE(C->dp[15]);
+   /* column 16 */
+   COMBA_FORWARD;
+   MULADD(at[1], at[31]);    MULADD(at[2], at[30]);    MULADD(at[3], at[29]);    MULADD(at[4], at[28]);    MULADD(at[5], at[27]);    MULADD(at[6], at[26]);    MULADD(at[7], at[25]);    MULADD(at[8], at[24]);    MULADD(at[9], at[23]);    MULADD(at[10], at[22]);    MULADD(at[11], at[21]);    MULADD(at[12], at[20]);    MULADD(at[13], at[19]);    MULADD(at[14], at[18]);    MULADD(at[15], at[17]);
+   COMBA_STORE(C->dp[16]);
+   /* column 17 */
+   COMBA_FORWARD;
+   MULADD(at[2], at[31]);    MULADD(at[3], at[30]);    MULADD(at[4], at[29]);    MULADD(at[5], at[28]);    MULADD(at[6], at[27]);    MULADD(at[7], at[26]);    MULADD(at[8], at[25]);    MULADD(at[9], at[24]);    MULADD(at[10], at[23]);    MULADD(at[11], at[22]);    MULADD(at[12], at[21]);    MULADD(at[13], at[20]);    MULADD(at[14], at[19]);    MULADD(at[15], at[18]);
+   COMBA_STORE(C->dp[17]);
+   /* column 18 */
+   COMBA_FORWARD;
+   MULADD(at[3], at[31]);    MULADD(at[4], at[30]);    MULADD(at[5], at[29]);    MULADD(at[6], at[28]);    MULADD(at[7], at[27]);    MULADD(at[8], at[26]);    MULADD(at[9], at[25]);    MULADD(at[10], at[24]);    MULADD(at[11], at[23]);    MULADD(at[12], at[22]);    MULADD(at[13], at[21]);    MULADD(at[14], at[20]);    MULADD(at[15], at[19]);
+   COMBA_STORE(C->dp[18]);
+   /* column 19 */
+   COMBA_FORWARD;
+   MULADD(at[4], at[31]);    MULADD(at[5], at[30]);    MULADD(at[6], at[29]);    MULADD(at[7], at[28]);    MULADD(at[8], at[27]);    MULADD(at[9], at[26]);    MULADD(at[10], at[25]);    MULADD(at[11], at[24]);    MULADD(at[12], at[23]);    MULADD(at[13], at[22]);    MULADD(at[14], at[21]);    MULADD(at[15], at[20]);
+   COMBA_STORE(C->dp[19]);
+   /* column 20 */
+   COMBA_FORWARD;
+   MULADD(at[5], at[31]);    MULADD(at[6], at[30]);    MULADD(at[7], at[29]);    MULADD(at[8], at[28]);    MULADD(at[9], at[27]);    MULADD(at[10], at[26]);    MULADD(at[11], at[25]);    MULADD(at[12], at[24]);    MULADD(at[13], at[23]);    MULADD(at[14], at[22]);    MULADD(at[15], at[21]);
+   COMBA_STORE(C->dp[20]);
+   /* column 21 */
+   COMBA_FORWARD;
+   MULADD(at[6], at[31]);    MULADD(at[7], at[30]);    MULADD(at[8], at[29]);    MULADD(at[9], at[28]);    MULADD(at[10], at[27]);    MULADD(at[11], at[26]);    MULADD(at[12], at[25]);    MULADD(at[13], at[24]);    MULADD(at[14], at[23]);    MULADD(at[15], at[22]);
+   COMBA_STORE(C->dp[21]);
+   /* column 22 */
+   COMBA_FORWARD;
+   MULADD(at[7], at[31]);    MULADD(at[8], at[30]);    MULADD(at[9], at[29]);    MULADD(at[10], at[28]);    MULADD(at[11], at[27]);    MULADD(at[12], at[26]);    MULADD(at[13], at[25]);    MULADD(at[14], at[24]);    MULADD(at[15], at[23]);
+   COMBA_STORE(C->dp[22]);
+   /* column 23 */
+   COMBA_FORWARD;
+   MULADD(at[8], at[31]);    MULADD(at[9], at[30]);    MULADD(at[10], at[29]);    MULADD(at[11], at[28]);    MULADD(at[12], at[27]);    MULADD(at[13], at[26]);    MULADD(at[14], at[25]);    MULADD(at[15], at[24]);
+   COMBA_STORE(C->dp[23]);
+   /* column 24 */
+   COMBA_FORWARD;
+   MULADD(at[9], at[31]);    MULADD(at[10], at[30]);    MULADD(at[11], at[29]);    MULADD(at[12], at[28]);    MULADD(at[13], at[27]);    MULADD(at[14], at[26]);    MULADD(at[15], at[25]);
+   COMBA_STORE(C->dp[24]);
+   /* column 25 */
+   COMBA_FORWARD;
+   MULADD(at[10], at[31]);    MULADD(at[11], at[30]);    MULADD(at[12], at[29]);    MULADD(at[13], at[28]);    MULADD(at[14], at[27]);    MULADD(at[15], at[26]);
+   COMBA_STORE(C->dp[25]);
+   /* column 26 */
+   COMBA_FORWARD;
+   MULADD(at[11], at[31]);    MULADD(at[12], at[30]);    MULADD(at[13], at[29]);    MULADD(at[14], at[28]);    MULADD(at[15], at[27]);
+   COMBA_STORE(C->dp[26]);
+   /* column 27 */
+   COMBA_FORWARD;
+   MULADD(at[12], at[31]);    MULADD(at[13], at[30]);    MULADD(at[14], at[29]);    MULADD(at[15], at[28]);
+   COMBA_STORE(C->dp[27]);
+   /* column 28 */
+   COMBA_FORWARD;
+   MULADD(at[13], at[31]);    MULADD(at[14], at[30]);    MULADD(at[15], at[29]);
+   COMBA_STORE(C->dp[28]);
+   /* column 29 */
+   COMBA_FORWARD;
+   MULADD(at[14], at[31]);    MULADD(at[15], at[30]);
+   COMBA_STORE(C->dp[29]);
+   /* column 30 */
+   COMBA_FORWARD;
+   MULADD(at[15], at[31]);
+   COMBA_STORE(C->dp[30]);
+   COMBA_STORE2(C->dp[31]); /* NOTE(review): presumably the carry left over from column 30 -- confirm against the COMBA_STORE2 definition */
+   C->used = 32; /* digits 0..31 were all written above */
+   C->sign = A->sign ^ B->sign; /* XOR of the two operand sign fields */
+   pstm_clamp(C); /* NOTE(review): presumably lowers C->used past leading zero digits -- confirm */
+   COMBA_FINI;
+   return PSTM_OKAY;
+}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+
+
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+static int32 pstm_mul_comba32(pstm_int *A, pstm_int *B, pstm_int *C)
+{
+   pstm_digit c0, c1, c2, at[64];
+   int32 out_size;
+
+	if (C->alloc < 64) {
+		if (pstm_grow(C, 64) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+   out_size = A->used + B->used;
+   memcpy(at, A->dp, 32 * sizeof(pstm_digit));
+   memcpy(at+32, B->dp, 32 * sizeof(pstm_digit));
+   COMBA_START;
+
+   COMBA_CLEAR;
+   /* 0 */
+   MULADD(at[0], at[32]);
+   COMBA_STORE(C->dp[0]);
+   /* 1 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[33]);    MULADD(at[1], at[32]);
+   COMBA_STORE(C->dp[1]);
+   /* 2 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[34]);    MULADD(at[1], at[33]);    MULADD(at[2], at[32]);
+   COMBA_STORE(C->dp[2]);
+   /* 3 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[35]);    MULADD(at[1], at[34]);    MULADD(at[2], at[33]);    MULADD(at[3], at[32]);
+   COMBA_STORE(C->dp[3]);
+   /* 4 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[36]);    MULADD(at[1], at[35]);    MULADD(at[2], at[34]);    MULADD(at[3], at[33]);    MULADD(at[4], at[32]);
+   COMBA_STORE(C->dp[4]);
+   /* 5 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[37]);    MULADD(at[1], at[36]);    MULADD(at[2], at[35]);    MULADD(at[3], at[34]);    MULADD(at[4], at[33]);    MULADD(at[5], at[32]);
+   COMBA_STORE(C->dp[5]);
+   /* 6 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[38]);    MULADD(at[1], at[37]);    MULADD(at[2], at[36]);    MULADD(at[3], at[35]);    MULADD(at[4], at[34]);    MULADD(at[5], at[33]);    MULADD(at[6], at[32]);
+   COMBA_STORE(C->dp[6]);
+   /* 7 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[39]);    MULADD(at[1], at[38]);    MULADD(at[2], at[37]);    MULADD(at[3], at[36]);    MULADD(at[4], at[35]);    MULADD(at[5], at[34]);    MULADD(at[6], at[33]);    MULADD(at[7], at[32]);
+   COMBA_STORE(C->dp[7]);
+   /* 8 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[40]);    MULADD(at[1], at[39]);    MULADD(at[2], at[38]);    MULADD(at[3], at[37]);    MULADD(at[4], at[36]);    MULADD(at[5], at[35]);    MULADD(at[6], at[34]);    MULADD(at[7], at[33]);    MULADD(at[8], at[32]);
+   COMBA_STORE(C->dp[8]);
+   /* 9 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[41]);    MULADD(at[1], at[40]);    MULADD(at[2], at[39]);    MULADD(at[3], at[38]);    MULADD(at[4], at[37]);    MULADD(at[5], at[36]);    MULADD(at[6], at[35]);    MULADD(at[7], at[34]);    MULADD(at[8], at[33]);    MULADD(at[9], at[32]);
+   COMBA_STORE(C->dp[9]);
+   /* 10 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[42]);    MULADD(at[1], at[41]);    MULADD(at[2], at[40]);    MULADD(at[3], at[39]);    MULADD(at[4], at[38]);    MULADD(at[5], at[37]);    MULADD(at[6], at[36]);    MULADD(at[7], at[35]);    MULADD(at[8], at[34]);    MULADD(at[9], at[33]);    MULADD(at[10], at[32]);
+   COMBA_STORE(C->dp[10]);
+   /* 11 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[43]);    MULADD(at[1], at[42]);    MULADD(at[2], at[41]);    MULADD(at[3], at[40]);    MULADD(at[4], at[39]);    MULADD(at[5], at[38]);    MULADD(at[6], at[37]);    MULADD(at[7], at[36]);    MULADD(at[8], at[35]);    MULADD(at[9], at[34]);    MULADD(at[10], at[33]);    MULADD(at[11], at[32]);
+   COMBA_STORE(C->dp[11]);
+   /* 12 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[44]);    MULADD(at[1], at[43]);    MULADD(at[2], at[42]);    MULADD(at[3], at[41]);    MULADD(at[4], at[40]);    MULADD(at[5], at[39]);    MULADD(at[6], at[38]);    MULADD(at[7], at[37]);    MULADD(at[8], at[36]);    MULADD(at[9], at[35]);    MULADD(at[10], at[34]);    MULADD(at[11], at[33]);    MULADD(at[12], at[32]);
+   COMBA_STORE(C->dp[12]);
+   /* 13 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[45]);    MULADD(at[1], at[44]);    MULADD(at[2], at[43]);    MULADD(at[3], at[42]);    MULADD(at[4], at[41]);    MULADD(at[5], at[40]);    MULADD(at[6], at[39]);    MULADD(at[7], at[38]);    MULADD(at[8], at[37]);    MULADD(at[9], at[36]);    MULADD(at[10], at[35]);    MULADD(at[11], at[34]);    MULADD(at[12], at[33]);    MULADD(at[13], at[32]);
+   COMBA_STORE(C->dp[13]);
+   /* 14 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[46]);    MULADD(at[1], at[45]);    MULADD(at[2], at[44]);    MULADD(at[3], at[43]);    MULADD(at[4], at[42]);    MULADD(at[5], at[41]);    MULADD(at[6], at[40]);    MULADD(at[7], at[39]);    MULADD(at[8], at[38]);    MULADD(at[9], at[37]);    MULADD(at[10], at[36]);    MULADD(at[11], at[35]);    MULADD(at[12], at[34]);    MULADD(at[13], at[33]);    MULADD(at[14], at[32]);
+   COMBA_STORE(C->dp[14]);
+   /* 15 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[47]);    MULADD(at[1], at[46]);    MULADD(at[2], at[45]);    MULADD(at[3], at[44]);    MULADD(at[4], at[43]);    MULADD(at[5], at[42]);    MULADD(at[6], at[41]);    MULADD(at[7], at[40]);    MULADD(at[8], at[39]);    MULADD(at[9], at[38]);    MULADD(at[10], at[37]);    MULADD(at[11], at[36]);    MULADD(at[12], at[35]);    MULADD(at[13], at[34]);    MULADD(at[14], at[33]);    MULADD(at[15], at[32]);
+   COMBA_STORE(C->dp[15]);
+   /* 16 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[48]);    MULADD(at[1], at[47]);    MULADD(at[2], at[46]);    MULADD(at[3], at[45]);    MULADD(at[4], at[44]);    MULADD(at[5], at[43]);    MULADD(at[6], at[42]);    MULADD(at[7], at[41]);    MULADD(at[8], at[40]);    MULADD(at[9], at[39]);    MULADD(at[10], at[38]);    MULADD(at[11], at[37]);    MULADD(at[12], at[36]);    MULADD(at[13], at[35]);    MULADD(at[14], at[34]);    MULADD(at[15], at[33]);    MULADD(at[16], at[32]);
+   COMBA_STORE(C->dp[16]);
+   /* 17 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[49]);    MULADD(at[1], at[48]);    MULADD(at[2], at[47]);    MULADD(at[3], at[46]);    MULADD(at[4], at[45]);    MULADD(at[5], at[44]);    MULADD(at[6], at[43]);    MULADD(at[7], at[42]);    MULADD(at[8], at[41]);    MULADD(at[9], at[40]);    MULADD(at[10], at[39]);    MULADD(at[11], at[38]);    MULADD(at[12], at[37]);    MULADD(at[13], at[36]);    MULADD(at[14], at[35]);    MULADD(at[15], at[34]);    MULADD(at[16], at[33]);    MULADD(at[17], at[32]);
+   COMBA_STORE(C->dp[17]);
+   /* 18 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[50]);    MULADD(at[1], at[49]);    MULADD(at[2], at[48]);    MULADD(at[3], at[47]);    MULADD(at[4], at[46]);    MULADD(at[5], at[45]);    MULADD(at[6], at[44]);    MULADD(at[7], at[43]);    MULADD(at[8], at[42]);    MULADD(at[9], at[41]);    MULADD(at[10], at[40]);    MULADD(at[11], at[39]);    MULADD(at[12], at[38]);    MULADD(at[13], at[37]);    MULADD(at[14], at[36]);    MULADD(at[15], at[35]);    MULADD(at[16], at[34]);    MULADD(at[17], at[33]);    MULADD(at[18], at[32]);
+   COMBA_STORE(C->dp[18]);
+   /* 19 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[51]);    MULADD(at[1], at[50]);    MULADD(at[2], at[49]);    MULADD(at[3], at[48]);    MULADD(at[4], at[47]);    MULADD(at[5], at[46]);    MULADD(at[6], at[45]);    MULADD(at[7], at[44]);    MULADD(at[8], at[43]);    MULADD(at[9], at[42]);    MULADD(at[10], at[41]);    MULADD(at[11], at[40]);    MULADD(at[12], at[39]);    MULADD(at[13], at[38]);    MULADD(at[14], at[37]);    MULADD(at[15], at[36]);    MULADD(at[16], at[35]);    MULADD(at[17], at[34]);    MULADD(at[18], at[33]);    MULADD(at[19], at[32]);
+   COMBA_STORE(C->dp[19]);
+   /* 20 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[52]);    MULADD(at[1], at[51]);    MULADD(at[2], at[50]);    MULADD(at[3], at[49]);    MULADD(at[4], at[48]);    MULADD(at[5], at[47]);    MULADD(at[6], at[46]);    MULADD(at[7], at[45]);    MULADD(at[8], at[44]);    MULADD(at[9], at[43]);    MULADD(at[10], at[42]);    MULADD(at[11], at[41]);    MULADD(at[12], at[40]);    MULADD(at[13], at[39]);    MULADD(at[14], at[38]);    MULADD(at[15], at[37]);    MULADD(at[16], at[36]);    MULADD(at[17], at[35]);    MULADD(at[18], at[34]);    MULADD(at[19], at[33]);    MULADD(at[20], at[32]);
+   COMBA_STORE(C->dp[20]);
+   /* 21 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[53]);    MULADD(at[1], at[52]);    MULADD(at[2], at[51]);    MULADD(at[3], at[50]);    MULADD(at[4], at[49]);    MULADD(at[5], at[48]);    MULADD(at[6], at[47]);    MULADD(at[7], at[46]);    MULADD(at[8], at[45]);    MULADD(at[9], at[44]);    MULADD(at[10], at[43]);    MULADD(at[11], at[42]);    MULADD(at[12], at[41]);    MULADD(at[13], at[40]);    MULADD(at[14], at[39]);    MULADD(at[15], at[38]);    MULADD(at[16], at[37]);    MULADD(at[17], at[36]);    MULADD(at[18], at[35]);    MULADD(at[19], at[34]);    MULADD(at[20], at[33]);    MULADD(at[21], at[32]);
+   COMBA_STORE(C->dp[21]);
+   /* 22 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[54]);    MULADD(at[1], at[53]);    MULADD(at[2], at[52]);    MULADD(at[3], at[51]);    MULADD(at[4], at[50]);    MULADD(at[5], at[49]);    MULADD(at[6], at[48]);    MULADD(at[7], at[47]);    MULADD(at[8], at[46]);    MULADD(at[9], at[45]);    MULADD(at[10], at[44]);    MULADD(at[11], at[43]);    MULADD(at[12], at[42]);    MULADD(at[13], at[41]);    MULADD(at[14], at[40]);    MULADD(at[15], at[39]);    MULADD(at[16], at[38]);    MULADD(at[17], at[37]);    MULADD(at[18], at[36]);    MULADD(at[19], at[35]);    MULADD(at[20], at[34]);    MULADD(at[21], at[33]);    MULADD(at[22], at[32]);
+   COMBA_STORE(C->dp[22]);
+   /* 23 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[55]);    MULADD(at[1], at[54]);    MULADD(at[2], at[53]);    MULADD(at[3], at[52]);    MULADD(at[4], at[51]);    MULADD(at[5], at[50]);    MULADD(at[6], at[49]);    MULADD(at[7], at[48]);    MULADD(at[8], at[47]);    MULADD(at[9], at[46]);    MULADD(at[10], at[45]);    MULADD(at[11], at[44]);    MULADD(at[12], at[43]);    MULADD(at[13], at[42]);    MULADD(at[14], at[41]);    MULADD(at[15], at[40]);    MULADD(at[16], at[39]);    MULADD(at[17], at[38]);    MULADD(at[18], at[37]);    MULADD(at[19], at[36]);    MULADD(at[20], at[35]);    MULADD(at[21], at[34]);    MULADD(at[22], at[33]);    MULADD(at[23], at[32]);
+   COMBA_STORE(C->dp[23]);
+   /* 24 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[56]);    MULADD(at[1], at[55]);    MULADD(at[2], at[54]);    MULADD(at[3], at[53]);    MULADD(at[4], at[52]);    MULADD(at[5], at[51]);    MULADD(at[6], at[50]);    MULADD(at[7], at[49]);    MULADD(at[8], at[48]);    MULADD(at[9], at[47]);    MULADD(at[10], at[46]);    MULADD(at[11], at[45]);    MULADD(at[12], at[44]);    MULADD(at[13], at[43]);    MULADD(at[14], at[42]);    MULADD(at[15], at[41]);    MULADD(at[16], at[40]);    MULADD(at[17], at[39]);    MULADD(at[18], at[38]);    MULADD(at[19], at[37]);    MULADD(at[20], at[36]);    MULADD(at[21], at[35]);    MULADD(at[22], at[34]);    MULADD(at[23], at[33]);    MULADD(at[24], at[32]);
+   COMBA_STORE(C->dp[24]);
+   /* 25 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[57]);    MULADD(at[1], at[56]);    MULADD(at[2], at[55]);    MULADD(at[3], at[54]);    MULADD(at[4], at[53]);    MULADD(at[5], at[52]);    MULADD(at[6], at[51]);    MULADD(at[7], at[50]);    MULADD(at[8], at[49]);    MULADD(at[9], at[48]);    MULADD(at[10], at[47]);    MULADD(at[11], at[46]);    MULADD(at[12], at[45]);    MULADD(at[13], at[44]);    MULADD(at[14], at[43]);    MULADD(at[15], at[42]);    MULADD(at[16], at[41]);    MULADD(at[17], at[40]);    MULADD(at[18], at[39]);    MULADD(at[19], at[38]);    MULADD(at[20], at[37]);    MULADD(at[21], at[36]);    MULADD(at[22], at[35]);    MULADD(at[23], at[34]);    MULADD(at[24], at[33]);    MULADD(at[25], at[32]);
+   COMBA_STORE(C->dp[25]);
+   /* 26 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[58]);    MULADD(at[1], at[57]);    MULADD(at[2], at[56]);    MULADD(at[3], at[55]);    MULADD(at[4], at[54]);    MULADD(at[5], at[53]);    MULADD(at[6], at[52]);    MULADD(at[7], at[51]);    MULADD(at[8], at[50]);    MULADD(at[9], at[49]);    MULADD(at[10], at[48]);    MULADD(at[11], at[47]);    MULADD(at[12], at[46]);    MULADD(at[13], at[45]);    MULADD(at[14], at[44]);    MULADD(at[15], at[43]);    MULADD(at[16], at[42]);    MULADD(at[17], at[41]);    MULADD(at[18], at[40]);    MULADD(at[19], at[39]);    MULADD(at[20], at[38]);    MULADD(at[21], at[37]);    MULADD(at[22], at[36]);    MULADD(at[23], at[35]);    MULADD(at[24], at[34]);    MULADD(at[25], at[33]);    MULADD(at[26], at[32]);
+   COMBA_STORE(C->dp[26]);
+   /* 27 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[59]);    MULADD(at[1], at[58]);    MULADD(at[2], at[57]);    MULADD(at[3], at[56]);    MULADD(at[4], at[55]);    MULADD(at[5], at[54]);    MULADD(at[6], at[53]);    MULADD(at[7], at[52]);    MULADD(at[8], at[51]);    MULADD(at[9], at[50]);    MULADD(at[10], at[49]);    MULADD(at[11], at[48]);    MULADD(at[12], at[47]);    MULADD(at[13], at[46]);    MULADD(at[14], at[45]);    MULADD(at[15], at[44]);    MULADD(at[16], at[43]);    MULADD(at[17], at[42]);    MULADD(at[18], at[41]);    MULADD(at[19], at[40]);    MULADD(at[20], at[39]);    MULADD(at[21], at[38]);    MULADD(at[22], at[37]);    MULADD(at[23], at[36]);    MULADD(at[24], at[35]);    MULADD(at[25], at[34]);    MULADD(at[26], at[33]);    MULADD(at[27], at[32]);
+   COMBA_STORE(C->dp[27]);
+   /* 28 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[60]);    MULADD(at[1], at[59]);    MULADD(at[2], at[58]);    MULADD(at[3], at[57]);    MULADD(at[4], at[56]);    MULADD(at[5], at[55]);    MULADD(at[6], at[54]);    MULADD(at[7], at[53]);    MULADD(at[8], at[52]);    MULADD(at[9], at[51]);    MULADD(at[10], at[50]);    MULADD(at[11], at[49]);    MULADD(at[12], at[48]);    MULADD(at[13], at[47]);    MULADD(at[14], at[46]);    MULADD(at[15], at[45]);    MULADD(at[16], at[44]);    MULADD(at[17], at[43]);    MULADD(at[18], at[42]);    MULADD(at[19], at[41]);    MULADD(at[20], at[40]);    MULADD(at[21], at[39]);    MULADD(at[22], at[38]);    MULADD(at[23], at[37]);    MULADD(at[24], at[36]);    MULADD(at[25], at[35]);    MULADD(at[26], at[34]);    MULADD(at[27], at[33]);    MULADD(at[28], at[32]);
+   COMBA_STORE(C->dp[28]);
+   /* 29 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[61]);    MULADD(at[1], at[60]);    MULADD(at[2], at[59]);    MULADD(at[3], at[58]);    MULADD(at[4], at[57]);    MULADD(at[5], at[56]);    MULADD(at[6], at[55]);    MULADD(at[7], at[54]);    MULADD(at[8], at[53]);    MULADD(at[9], at[52]);    MULADD(at[10], at[51]);    MULADD(at[11], at[50]);    MULADD(at[12], at[49]);    MULADD(at[13], at[48]);    MULADD(at[14], at[47]);    MULADD(at[15], at[46]);    MULADD(at[16], at[45]);    MULADD(at[17], at[44]);    MULADD(at[18], at[43]);    MULADD(at[19], at[42]);    MULADD(at[20], at[41]);    MULADD(at[21], at[40]);    MULADD(at[22], at[39]);    MULADD(at[23], at[38]);    MULADD(at[24], at[37]);    MULADD(at[25], at[36]);    MULADD(at[26], at[35]);    MULADD(at[27], at[34]);    MULADD(at[28], at[33]);    MULADD(at[29], at[32]);
+   COMBA_STORE(C->dp[29]);
+   /* 30 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[62]);    MULADD(at[1], at[61]);    MULADD(at[2], at[60]);    MULADD(at[3], at[59]);    MULADD(at[4], at[58]);    MULADD(at[5], at[57]);    MULADD(at[6], at[56]);    MULADD(at[7], at[55]);    MULADD(at[8], at[54]);    MULADD(at[9], at[53]);    MULADD(at[10], at[52]);    MULADD(at[11], at[51]);    MULADD(at[12], at[50]);    MULADD(at[13], at[49]);    MULADD(at[14], at[48]);    MULADD(at[15], at[47]);    MULADD(at[16], at[46]);    MULADD(at[17], at[45]);    MULADD(at[18], at[44]);    MULADD(at[19], at[43]);    MULADD(at[20], at[42]);    MULADD(at[21], at[41]);    MULADD(at[22], at[40]);    MULADD(at[23], at[39]);    MULADD(at[24], at[38]);    MULADD(at[25], at[37]);    MULADD(at[26], at[36]);    MULADD(at[27], at[35]);    MULADD(at[28], at[34]);    MULADD(at[29], at[33]);    MULADD(at[30], at[32]);
+   COMBA_STORE(C->dp[30]);
+   /* 31 */
+   COMBA_FORWARD;
+   MULADD(at[0], at[63]);    MULADD(at[1], at[62]);    MULADD(at[2], at[61]);    MULADD(at[3], at[60]);    MULADD(at[4], at[59]);    MULADD(at[5], at[58]);    MULADD(at[6], at[57]);    MULADD(at[7], at[56]);    MULADD(at[8], at[55]);    MULADD(at[9], at[54]);    MULADD(at[10], at[53]);    MULADD(at[11], at[52]);    MULADD(at[12], at[51]);    MULADD(at[13], at[50]);    MULADD(at[14], at[49]);    MULADD(at[15], at[48]);    MULADD(at[16], at[47]);    MULADD(at[17], at[46]);    MULADD(at[18], at[45]);    MULADD(at[19], at[44]);    MULADD(at[20], at[43]);    MULADD(at[21], at[42]);    MULADD(at[22], at[41]);    MULADD(at[23], at[40]);    MULADD(at[24], at[39]);    MULADD(at[25], at[38]);    MULADD(at[26], at[37]);    MULADD(at[27], at[36]);    MULADD(at[28], at[35]);    MULADD(at[29], at[34]);    MULADD(at[30], at[33]);    MULADD(at[31], at[32]);
+   COMBA_STORE(C->dp[31]);
+   /* 32 */
+   COMBA_FORWARD;
+   MULADD(at[1], at[63]);    MULADD(at[2], at[62]);    MULADD(at[3], at[61]);    MULADD(at[4], at[60]);    MULADD(at[5], at[59]);    MULADD(at[6], at[58]);    MULADD(at[7], at[57]);    MULADD(at[8], at[56]);    MULADD(at[9], at[55]);    MULADD(at[10], at[54]);    MULADD(at[11], at[53]);    MULADD(at[12], at[52]);    MULADD(at[13], at[51]);    MULADD(at[14], at[50]);    MULADD(at[15], at[49]);    MULADD(at[16], at[48]);    MULADD(at[17], at[47]);    MULADD(at[18], at[46]);    MULADD(at[19], at[45]);    MULADD(at[20], at[44]);    MULADD(at[21], at[43]);    MULADD(at[22], at[42]);    MULADD(at[23], at[41]);    MULADD(at[24], at[40]);    MULADD(at[25], at[39]);    MULADD(at[26], at[38]);    MULADD(at[27], at[37]);    MULADD(at[28], at[36]);    MULADD(at[29], at[35]);    MULADD(at[30], at[34]);    MULADD(at[31], at[33]);
+   COMBA_STORE(C->dp[32]);
+   /* 33 */
+   COMBA_FORWARD;
+   MULADD(at[2], at[63]);    MULADD(at[3], at[62]);    MULADD(at[4], at[61]);    MULADD(at[5], at[60]);    MULADD(at[6], at[59]);    MULADD(at[7], at[58]);    MULADD(at[8], at[57]);    MULADD(at[9], at[56]);    MULADD(at[10], at[55]);    MULADD(at[11], at[54]);    MULADD(at[12], at[53]);    MULADD(at[13], at[52]);    MULADD(at[14], at[51]);    MULADD(at[15], at[50]);    MULADD(at[16], at[49]);    MULADD(at[17], at[48]);    MULADD(at[18], at[47]);    MULADD(at[19], at[46]);    MULADD(at[20], at[45]);    MULADD(at[21], at[44]);    MULADD(at[22], at[43]);    MULADD(at[23], at[42]);    MULADD(at[24], at[41]);    MULADD(at[25], at[40]);    MULADD(at[26], at[39]);    MULADD(at[27], at[38]);    MULADD(at[28], at[37]);    MULADD(at[29], at[36]);    MULADD(at[30], at[35]);    MULADD(at[31], at[34]);
+   COMBA_STORE(C->dp[33]);
+   /* 34 */
+   COMBA_FORWARD;
+   MULADD(at[3], at[63]);    MULADD(at[4], at[62]);    MULADD(at[5], at[61]);    MULADD(at[6], at[60]);    MULADD(at[7], at[59]);    MULADD(at[8], at[58]);    MULADD(at[9], at[57]);    MULADD(at[10], at[56]);    MULADD(at[11], at[55]);    MULADD(at[12], at[54]);    MULADD(at[13], at[53]);    MULADD(at[14], at[52]);    MULADD(at[15], at[51]);    MULADD(at[16], at[50]);    MULADD(at[17], at[49]);    MULADD(at[18], at[48]);    MULADD(at[19], at[47]);    MULADD(at[20], at[46]);    MULADD(at[21], at[45]);    MULADD(at[22], at[44]);    MULADD(at[23], at[43]);    MULADD(at[24], at[42]);    MULADD(at[25], at[41]);    MULADD(at[26], at[40]);    MULADD(at[27], at[39]);    MULADD(at[28], at[38]);    MULADD(at[29], at[37]);    MULADD(at[30], at[36]);    MULADD(at[31], at[35]);
+   COMBA_STORE(C->dp[34]);
+   /* 35 */
+   COMBA_FORWARD;
+   MULADD(at[4], at[63]);    MULADD(at[5], at[62]);    MULADD(at[6], at[61]);    MULADD(at[7], at[60]);    MULADD(at[8], at[59]);    MULADD(at[9], at[58]);    MULADD(at[10], at[57]);    MULADD(at[11], at[56]);    MULADD(at[12], at[55]);    MULADD(at[13], at[54]);    MULADD(at[14], at[53]);    MULADD(at[15], at[52]);    MULADD(at[16], at[51]);    MULADD(at[17], at[50]);    MULADD(at[18], at[49]);    MULADD(at[19], at[48]);    MULADD(at[20], at[47]);    MULADD(at[21], at[46]);    MULADD(at[22], at[45]);    MULADD(at[23], at[44]);    MULADD(at[24], at[43]);    MULADD(at[25], at[42]);    MULADD(at[26], at[41]);    MULADD(at[27], at[40]);    MULADD(at[28], at[39]);    MULADD(at[29], at[38]);    MULADD(at[30], at[37]);    MULADD(at[31], at[36]);
+   COMBA_STORE(C->dp[35]);
+   /* 36 */
+   COMBA_FORWARD;
+   MULADD(at[5], at[63]);    MULADD(at[6], at[62]);    MULADD(at[7], at[61]);    MULADD(at[8], at[60]);    MULADD(at[9], at[59]);    MULADD(at[10], at[58]);    MULADD(at[11], at[57]);    MULADD(at[12], at[56]);    MULADD(at[13], at[55]);    MULADD(at[14], at[54]);    MULADD(at[15], at[53]);    MULADD(at[16], at[52]);    MULADD(at[17], at[51]);    MULADD(at[18], at[50]);    MULADD(at[19], at[49]);    MULADD(at[20], at[48]);    MULADD(at[21], at[47]);    MULADD(at[22], at[46]);    MULADD(at[23], at[45]);    MULADD(at[24], at[44]);    MULADD(at[25], at[43]);    MULADD(at[26], at[42]);    MULADD(at[27], at[41]);    MULADD(at[28], at[40]);    MULADD(at[29], at[39]);    MULADD(at[30], at[38]);    MULADD(at[31], at[37]);
+   COMBA_STORE(C->dp[36]);
+   /* 37 */
+   COMBA_FORWARD;
+   MULADD(at[6], at[63]);    MULADD(at[7], at[62]);    MULADD(at[8], at[61]);    MULADD(at[9], at[60]);    MULADD(at[10], at[59]);    MULADD(at[11], at[58]);    MULADD(at[12], at[57]);    MULADD(at[13], at[56]);    MULADD(at[14], at[55]);    MULADD(at[15], at[54]);    MULADD(at[16], at[53]);    MULADD(at[17], at[52]);    MULADD(at[18], at[51]);    MULADD(at[19], at[50]);    MULADD(at[20], at[49]);    MULADD(at[21], at[48]);    MULADD(at[22], at[47]);    MULADD(at[23], at[46]);    MULADD(at[24], at[45]);    MULADD(at[25], at[44]);    MULADD(at[26], at[43]);    MULADD(at[27], at[42]);    MULADD(at[28], at[41]);    MULADD(at[29], at[40]);    MULADD(at[30], at[39]);    MULADD(at[31], at[38]);
+   COMBA_STORE(C->dp[37]);
+   /* 38 */
+   COMBA_FORWARD;
+   MULADD(at[7], at[63]);    MULADD(at[8], at[62]);    MULADD(at[9], at[61]);    MULADD(at[10], at[60]);    MULADD(at[11], at[59]);    MULADD(at[12], at[58]);    MULADD(at[13], at[57]);    MULADD(at[14], at[56]);    MULADD(at[15], at[55]);    MULADD(at[16], at[54]);    MULADD(at[17], at[53]);    MULADD(at[18], at[52]);    MULADD(at[19], at[51]);    MULADD(at[20], at[50]);    MULADD(at[21], at[49]);    MULADD(at[22], at[48]);    MULADD(at[23], at[47]);    MULADD(at[24], at[46]);    MULADD(at[25], at[45]);    MULADD(at[26], at[44]);    MULADD(at[27], at[43]);    MULADD(at[28], at[42]);    MULADD(at[29], at[41]);    MULADD(at[30], at[40]);    MULADD(at[31], at[39]);
+   COMBA_STORE(C->dp[38]);
+
+   /* early out at 40 digits, 40*32==1280, or two 640 bit operands */
+   if (out_size <= 40) { COMBA_STORE2(C->dp[39]); C->used = 40; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; }
+
+   /* 39 */
+   COMBA_FORWARD;
+   MULADD(at[8], at[63]);    MULADD(at[9], at[62]);    MULADD(at[10], at[61]);    MULADD(at[11], at[60]);    MULADD(at[12], at[59]);    MULADD(at[13], at[58]);    MULADD(at[14], at[57]);    MULADD(at[15], at[56]);    MULADD(at[16], at[55]);    MULADD(at[17], at[54]);    MULADD(at[18], at[53]);    MULADD(at[19], at[52]);    MULADD(at[20], at[51]);    MULADD(at[21], at[50]);    MULADD(at[22], at[49]);    MULADD(at[23], at[48]);    MULADD(at[24], at[47]);    MULADD(at[25], at[46]);    MULADD(at[26], at[45]);    MULADD(at[27], at[44]);    MULADD(at[28], at[43]);    MULADD(at[29], at[42]);    MULADD(at[30], at[41]);    MULADD(at[31], at[40]);
+   COMBA_STORE(C->dp[39]);
+   /* 40 */
+   COMBA_FORWARD;
+   MULADD(at[9], at[63]);    MULADD(at[10], at[62]);    MULADD(at[11], at[61]);    MULADD(at[12], at[60]);    MULADD(at[13], at[59]);    MULADD(at[14], at[58]);    MULADD(at[15], at[57]);    MULADD(at[16], at[56]);    MULADD(at[17], at[55]);    MULADD(at[18], at[54]);    MULADD(at[19], at[53]);    MULADD(at[20], at[52]);    MULADD(at[21], at[51]);    MULADD(at[22], at[50]);    MULADD(at[23], at[49]);    MULADD(at[24], at[48]);    MULADD(at[25], at[47]);    MULADD(at[26], at[46]);    MULADD(at[27], at[45]);    MULADD(at[28], at[44]);    MULADD(at[29], at[43]);    MULADD(at[30], at[42]);    MULADD(at[31], at[41]);
+   COMBA_STORE(C->dp[40]);
+   /* 41 */
+   COMBA_FORWARD;
+   MULADD(at[10], at[63]);    MULADD(at[11], at[62]);    MULADD(at[12], at[61]);    MULADD(at[13], at[60]);    MULADD(at[14], at[59]);    MULADD(at[15], at[58]);    MULADD(at[16], at[57]);    MULADD(at[17], at[56]);    MULADD(at[18], at[55]);    MULADD(at[19], at[54]);    MULADD(at[20], at[53]);    MULADD(at[21], at[52]);    MULADD(at[22], at[51]);    MULADD(at[23], at[50]);    MULADD(at[24], at[49]);    MULADD(at[25], at[48]);    MULADD(at[26], at[47]);    MULADD(at[27], at[46]);    MULADD(at[28], at[45]);    MULADD(at[29], at[44]);    MULADD(at[30], at[43]);    MULADD(at[31], at[42]);
+   COMBA_STORE(C->dp[41]);
+   /* 42 */
+   COMBA_FORWARD;
+   MULADD(at[11], at[63]);    MULADD(at[12], at[62]);    MULADD(at[13], at[61]);    MULADD(at[14], at[60]);    MULADD(at[15], at[59]);    MULADD(at[16], at[58]);    MULADD(at[17], at[57]);    MULADD(at[18], at[56]);    MULADD(at[19], at[55]);    MULADD(at[20], at[54]);    MULADD(at[21], at[53]);    MULADD(at[22], at[52]);    MULADD(at[23], at[51]);    MULADD(at[24], at[50]);    MULADD(at[25], at[49]);    MULADD(at[26], at[48]);    MULADD(at[27], at[47]);    MULADD(at[28], at[46]);    MULADD(at[29], at[45]);    MULADD(at[30], at[44]);    MULADD(at[31], at[43]);
+   COMBA_STORE(C->dp[42]);
+   /* 43 */
+   COMBA_FORWARD;
+   MULADD(at[12], at[63]);    MULADD(at[13], at[62]);    MULADD(at[14], at[61]);    MULADD(at[15], at[60]);    MULADD(at[16], at[59]);    MULADD(at[17], at[58]);    MULADD(at[18], at[57]);    MULADD(at[19], at[56]);    MULADD(at[20], at[55]);    MULADD(at[21], at[54]);    MULADD(at[22], at[53]);    MULADD(at[23], at[52]);    MULADD(at[24], at[51]);    MULADD(at[25], at[50]);    MULADD(at[26], at[49]);    MULADD(at[27], at[48]);    MULADD(at[28], at[47]);    MULADD(at[29], at[46]);    MULADD(at[30], at[45]);    MULADD(at[31], at[44]);
+   COMBA_STORE(C->dp[43]);
+   /* 44 */
+   COMBA_FORWARD;
+   MULADD(at[13], at[63]);    MULADD(at[14], at[62]);    MULADD(at[15], at[61]);    MULADD(at[16], at[60]);    MULADD(at[17], at[59]);    MULADD(at[18], at[58]);    MULADD(at[19], at[57]);    MULADD(at[20], at[56]);    MULADD(at[21], at[55]);    MULADD(at[22], at[54]);    MULADD(at[23], at[53]);    MULADD(at[24], at[52]);    MULADD(at[25], at[51]);    MULADD(at[26], at[50]);    MULADD(at[27], at[49]);    MULADD(at[28], at[48]);    MULADD(at[29], at[47]);    MULADD(at[30], at[46]);    MULADD(at[31], at[45]);
+   COMBA_STORE(C->dp[44]);
+   /* 45 */
+   COMBA_FORWARD;
+   MULADD(at[14], at[63]);    MULADD(at[15], at[62]);    MULADD(at[16], at[61]);    MULADD(at[17], at[60]);    MULADD(at[18], at[59]);    MULADD(at[19], at[58]);    MULADD(at[20], at[57]);    MULADD(at[21], at[56]);    MULADD(at[22], at[55]);    MULADD(at[23], at[54]);    MULADD(at[24], at[53]);    MULADD(at[25], at[52]);    MULADD(at[26], at[51]);    MULADD(at[27], at[50]);    MULADD(at[28], at[49]);    MULADD(at[29], at[48]);    MULADD(at[30], at[47]);    MULADD(at[31], at[46]);
+   COMBA_STORE(C->dp[45]);
+   /* 46 */
+   COMBA_FORWARD;
+   MULADD(at[15], at[63]);    MULADD(at[16], at[62]);    MULADD(at[17], at[61]);    MULADD(at[18], at[60]);    MULADD(at[19], at[59]);    MULADD(at[20], at[58]);    MULADD(at[21], at[57]);    MULADD(at[22], at[56]);    MULADD(at[23], at[55]);    MULADD(at[24], at[54]);    MULADD(at[25], at[53]);    MULADD(at[26], at[52]);    MULADD(at[27], at[51]);    MULADD(at[28], at[50]);    MULADD(at[29], at[49]);    MULADD(at[30], at[48]);    MULADD(at[31], at[47]);
+   COMBA_STORE(C->dp[46]);
+
+   /* early out at 48 digits, 48*32==1536, or two 768 bit operands */
+   if (out_size <= 48) { COMBA_STORE2(C->dp[47]); C->used = 48; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; }
+
+   /* 47 */
+   COMBA_FORWARD;
+   MULADD(at[16], at[63]);    MULADD(at[17], at[62]);    MULADD(at[18], at[61]);    MULADD(at[19], at[60]);    MULADD(at[20], at[59]);    MULADD(at[21], at[58]);    MULADD(at[22], at[57]);    MULADD(at[23], at[56]);    MULADD(at[24], at[55]);    MULADD(at[25], at[54]);    MULADD(at[26], at[53]);    MULADD(at[27], at[52]);    MULADD(at[28], at[51]);    MULADD(at[29], at[50]);    MULADD(at[30], at[49]);    MULADD(at[31], at[48]);
+   COMBA_STORE(C->dp[47]);
+   /* 48 */
+   COMBA_FORWARD;
+   MULADD(at[17], at[63]);    MULADD(at[18], at[62]);    MULADD(at[19], at[61]);    MULADD(at[20], at[60]);    MULADD(at[21], at[59]);    MULADD(at[22], at[58]);    MULADD(at[23], at[57]);    MULADD(at[24], at[56]);    MULADD(at[25], at[55]);    MULADD(at[26], at[54]);    MULADD(at[27], at[53]);    MULADD(at[28], at[52]);    MULADD(at[29], at[51]);    MULADD(at[30], at[50]);    MULADD(at[31], at[49]);
+   COMBA_STORE(C->dp[48]);
+   /* 49 */
+   COMBA_FORWARD;
+   MULADD(at[18], at[63]);    MULADD(at[19], at[62]);    MULADD(at[20], at[61]);    MULADD(at[21], at[60]);    MULADD(at[22], at[59]);    MULADD(at[23], at[58]);    MULADD(at[24], at[57]);    MULADD(at[25], at[56]);    MULADD(at[26], at[55]);    MULADD(at[27], at[54]);    MULADD(at[28], at[53]);    MULADD(at[29], at[52]);    MULADD(at[30], at[51]);    MULADD(at[31], at[50]);
+   COMBA_STORE(C->dp[49]);
+   /* 50 */
+   COMBA_FORWARD;
+   MULADD(at[19], at[63]);    MULADD(at[20], at[62]);    MULADD(at[21], at[61]);    MULADD(at[22], at[60]);    MULADD(at[23], at[59]);    MULADD(at[24], at[58]);    MULADD(at[25], at[57]);    MULADD(at[26], at[56]);    MULADD(at[27], at[55]);    MULADD(at[28], at[54]);    MULADD(at[29], at[53]);    MULADD(at[30], at[52]);    MULADD(at[31], at[51]);
+   COMBA_STORE(C->dp[50]);
+   /* 51 */
+   COMBA_FORWARD;
+   MULADD(at[20], at[63]);    MULADD(at[21], at[62]);    MULADD(at[22], at[61]);    MULADD(at[23], at[60]);    MULADD(at[24], at[59]);    MULADD(at[25], at[58]);    MULADD(at[26], at[57]);    MULADD(at[27], at[56]);    MULADD(at[28], at[55]);    MULADD(at[29], at[54]);    MULADD(at[30], at[53]);    MULADD(at[31], at[52]);
+   COMBA_STORE(C->dp[51]);
+   /* 52 */
+   COMBA_FORWARD;
+   MULADD(at[21], at[63]);    MULADD(at[22], at[62]);    MULADD(at[23], at[61]);    MULADD(at[24], at[60]);    MULADD(at[25], at[59]);    MULADD(at[26], at[58]);    MULADD(at[27], at[57]);    MULADD(at[28], at[56]);    MULADD(at[29], at[55]);    MULADD(at[30], at[54]);    MULADD(at[31], at[53]);
+   COMBA_STORE(C->dp[52]);
+   /* 53 */
+   COMBA_FORWARD;
+   MULADD(at[22], at[63]);    MULADD(at[23], at[62]);    MULADD(at[24], at[61]);    MULADD(at[25], at[60]);    MULADD(at[26], at[59]);    MULADD(at[27], at[58]);    MULADD(at[28], at[57]);    MULADD(at[29], at[56]);    MULADD(at[30], at[55]);    MULADD(at[31], at[54]);
+   COMBA_STORE(C->dp[53]);
+   /* 54 */
+   COMBA_FORWARD;
+   MULADD(at[23], at[63]);    MULADD(at[24], at[62]);    MULADD(at[25], at[61]);    MULADD(at[26], at[60]);    MULADD(at[27], at[59]);    MULADD(at[28], at[58]);    MULADD(at[29], at[57]);    MULADD(at[30], at[56]);    MULADD(at[31], at[55]);
+   COMBA_STORE(C->dp[54]);
+
+   /* early out at 56 digits, 56*32==1792, or two 896 bit operands */
+   if (out_size <= 56) { COMBA_STORE2(C->dp[55]); C->used = 56; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; }
+
+   /* 55 */
+   COMBA_FORWARD;
+   MULADD(at[24], at[63]);    MULADD(at[25], at[62]);    MULADD(at[26], at[61]);    MULADD(at[27], at[60]);    MULADD(at[28], at[59]);    MULADD(at[29], at[58]);    MULADD(at[30], at[57]);    MULADD(at[31], at[56]);
+   COMBA_STORE(C->dp[55]);
+   /* 56 */
+   COMBA_FORWARD;
+   MULADD(at[25], at[63]);    MULADD(at[26], at[62]);    MULADD(at[27], at[61]);    MULADD(at[28], at[60]);    MULADD(at[29], at[59]);    MULADD(at[30], at[58]);    MULADD(at[31], at[57]);
+   COMBA_STORE(C->dp[56]);
+   /* 57 */
+   COMBA_FORWARD;
+   MULADD(at[26], at[63]);    MULADD(at[27], at[62]);    MULADD(at[28], at[61]);    MULADD(at[29], at[60]);    MULADD(at[30], at[59]);    MULADD(at[31], at[58]);
+   COMBA_STORE(C->dp[57]);
+   /* 58 */
+   COMBA_FORWARD;
+   MULADD(at[27], at[63]);    MULADD(at[28], at[62]);    MULADD(at[29], at[61]);    MULADD(at[30], at[60]);    MULADD(at[31], at[59]);
+   COMBA_STORE(C->dp[58]);
+   /* 59 */
+   COMBA_FORWARD;
+   MULADD(at[28], at[63]);    MULADD(at[29], at[62]);    MULADD(at[30], at[61]);    MULADD(at[31], at[60]);
+   COMBA_STORE(C->dp[59]);
+   /* 60 */
+   COMBA_FORWARD;
+   MULADD(at[29], at[63]);    MULADD(at[30], at[62]);    MULADD(at[31], at[61]);
+   COMBA_STORE(C->dp[60]);
+   /* 61 */
+   COMBA_FORWARD;
+   MULADD(at[30], at[63]);    MULADD(at[31], at[62]);
+   COMBA_STORE(C->dp[61]);
+   /* 62 */
+   COMBA_FORWARD;
+   MULADD(at[31], at[63]);
+   COMBA_STORE(C->dp[62]);
+   COMBA_STORE2(C->dp[63]);
+   C->used = 64;
+   C->sign = A->sign ^ B->sign;
+   pstm_clamp(C);
+   COMBA_FINI;
+	return PSTM_OKAY;
+}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+
+/******************************************************************************/
+
+int32 pstm_mul_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_int *C,
+			pstm_digit *paD, uint32 paDlen)
+{
+	/*
+	 * Multiplies A by B into C, picking a multiplier by operand size.
+	 * The unrolled variants exist only when the matching key-size
+	 * optimization is compiled in, and apply only when both operands
+	 * have exactly 16 (resp. 32) digits in use.
+	 */
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+	if (A->used == 16 && B->used == 16) {
+		return pstm_mul_comba16(A, B, C);
+	}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+	if (A->used == 32 && B->used == 32) {
+		return pstm_mul_comba32(A, B, C);
+	}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+
+	/* everything else goes through the generic (looping) multiplier */
+	return pstm_mul_comba_gen(pool, A, B, C, paD, paDlen);
+}
+
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_pstm_sqr_comba.c b/networking/tls_pstm_sqr_comba.c
new file mode 100644
index 0000000..98186d3
--- /dev/null
+++ b/networking/tls_pstm_sqr_comba.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ *	@file    pstm_sqr_comba.c
+ *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ *	Multiprecision Squaring with Comba technique.
+ */
+/*
+ *	Copyright (c) 2013-2015 INSIDE Secure Corporation
+ *	Copyright (c) PeerSec Networks, 2002-2011
+ *	All Rights Reserved
+ *
+ *	The latest version of this code is available at http://www.matrixssl.org
+ *
+ *	This software is open source; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	This General Public License does NOT permit incorporating this software
+ *	into proprietary programs.  If you are unable to comply with the GPL, a
+ *	commercial license for this software may be purchased from INSIDE at
+ *	http://www.insidesecure.com/eng/Company/Locations
+ *
+ *	This program is distributed in WITHOUT ANY WARRANTY; without even the
+ *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *	See the GNU General Public License for more details.
+ *
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *	http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+/******************************************************************************/
+#if defined(PSTM_X86)
+/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
+#if !defined(__GNUC__) || !defined(__i386__)
+#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
+#endif
+//#pragma message ("Using 32 bit x86 Assembly Optimizations")
+/* c2:c1:c0 (c0 lowest) is the caller's column accumulator, sc2:sc1:sc0 a secondary one */
+#define COMBA_START
+/* zero the column accumulator */
+#define CLEAR_CARRY \
+   c0 = c1 = c2 = 0;
+/* x = lowest accumulator digit */
+#define COMBA_STORE(x) \
+   x = c0;
+/* x = middle accumulator digit */
+#define COMBA_STORE2(x) \
+   x = c1;
+/* move on to the next column: shift the accumulator down by one digit */
+#define CARRY_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+/* c2:c1:c0 += i * i (j is not referenced by the asm) */
+#define SQRADD(i, j)                                      \
+asm(                                            \
+	 "movl  %6,%%eax     \n\t"                            \
+	 "mull  %%eax        \n\t"                            \
+	 "addl  %%eax,%0     \n\t"                            \
+	 "adcl  %%edx,%1     \n\t"                            \
+	 "adcl  $0,%2        \n\t"                            \
+	 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
+/* c2:c1:c0 += 2 * i * j (the product is computed once and added twice) */
+#define SQRADD2(i, j)                                     \
+asm(                                            \
+	 "movl  %6,%%eax     \n\t"                            \
+	 "mull  %7           \n\t"                            \
+	 "addl  %%eax,%0     \n\t"                            \
+	 "adcl  %%edx,%1     \n\t"                            \
+	 "adcl  $0,%2        \n\t"                            \
+	 "addl  %%eax,%0     \n\t"                            \
+	 "adcl  %%edx,%1     \n\t"                            \
+	 "adcl  $0,%2        \n\t"                            \
+	 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
+/* sc2:sc1:sc0 = i * j (starts a new secondary sum, sc2 is cleared) */
+#define SQRADDSC(i, j)                                    \
+asm(                                                     \
+	 "movl  %6,%%eax     \n\t"                            \
+	 "mull  %7           \n\t"                            \
+	 "movl  %%eax,%0     \n\t"                            \
+	 "movl  %%edx,%1     \n\t"                            \
+	 "xorl  %2,%2        \n\t"                            \
+	 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+/* sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j)                                    \
+asm(                                                     \
+	 "movl  %6,%%eax     \n\t"                            \
+	 "mull  %7           \n\t"                            \
+	 "addl  %%eax,%0     \n\t"                            \
+	 "adcl  %%edx,%1     \n\t"                            \
+	 "adcl  $0,%2        \n\t"                            \
+	 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+/* c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary sum is added twice) */
+#define SQRADDDB                                          \
+asm(                                                     \
+	 "addl %6,%0         \n\t"                            \
+	 "adcl %7,%1         \n\t"                            \
+	 "adcl %8,%2         \n\t"                            \
+	 "addl %6,%0         \n\t"                            \
+	 "adcl %7,%1         \n\t"                            \
+	 "adcl %8,%2         \n\t"                            \
+	 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_X86_64)
+/* x86-64 optimized */
+#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
+#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
+#endif
+//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
+/* c2:c1:c0 (c0 lowest) is the caller's column accumulator, sc2:sc1:sc0 a secondary one */
+#define COMBA_START
+/* zero the column accumulator */
+#define CLEAR_CARRY \
+c0 = c1 = c2 = 0;
+/* x = lowest accumulator digit */
+#define COMBA_STORE(x) \
+x = c0;
+/* x = middle accumulator digit */
+#define COMBA_STORE2(x) \
+x = c1;
+/* move on to the next column: shift the accumulator down by one digit */
+#define CARRY_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+/* c2:c1:c0 += i * i (j is not referenced by the asm) */
+#define SQRADD(i, j)                                     \
+asm(                                                     \
+	"movq  %6,%%rax     \n\t"                            \
+	"mulq  %%rax        \n\t"                            \
+	"addq  %%rax,%0     \n\t"                            \
+	"adcq  %%rdx,%1     \n\t"                            \
+	"adcq  $0,%2        \n\t"                            \
+	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
+/* c2:c1:c0 += 2 * i * j (the product is computed once and added twice) */
+#define SQRADD2(i, j)                                    \
+asm(                                                     \
+	"movq  %6,%%rax     \n\t"                            \
+	"mulq  %7           \n\t"                            \
+	"addq  %%rax,%0     \n\t"                            \
+	"adcq  %%rdx,%1     \n\t"                            \
+	"adcq  $0,%2        \n\t"                            \
+	"addq  %%rax,%0     \n\t"                            \
+	"adcq  %%rdx,%1     \n\t"                            \
+	"adcq  $0,%2        \n\t"                            \
+	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
+/* sc2:sc1:sc0 = i * j (starts a new secondary sum, sc2 is cleared) */
+#define SQRADDSC(i, j)                                   \
+asm(                                                     \
+	"movq  %6,%%rax     \n\t"                            \
+	"mulq  %7           \n\t"                            \
+	"movq  %%rax,%0     \n\t"                            \
+	"movq  %%rdx,%1     \n\t"                            \
+	"xorq  %2,%2        \n\t"                            \
+	:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
+/* sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j)                                   \
+asm(                                                     \
+	"movq  %6,%%rax     \n\t"                            \
+	"mulq  %7           \n\t"                            \
+	"addq  %%rax,%0     \n\t"                            \
+	"adcq  %%rdx,%1     \n\t"                            \
+	"adcq  $0,%2        \n\t"                            \
+	:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
+/* c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary sum is added twice) */
+#define SQRADDDB                                         \
+asm(                                                     \
+	"addq %6,%0         \n\t"                            \
+	"adcq %7,%1         \n\t"                            \
+	"adcq %8,%2         \n\t"                            \
+	"addq %6,%0         \n\t"                            \
+	"adcq %7,%1         \n\t"                            \
+	"adcq %8,%2         \n\t"                            \
+	:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
+
+/******************************************************************************/
+#elif defined(PSTM_ARM)
+/* ARM code */
+//#pragma message ("Using 32 bit ARM Assembly Optimizations")
+/* c2:c1:c0 (c0 lowest) is the caller's column accumulator, sc2:sc1:sc0 a secondary one */
+#define COMBA_START
+/* zero the column accumulator */
+#define CLEAR_CARRY \
+c0 = c1 = c2 = 0;
+/* x = lowest accumulator digit */
+#define COMBA_STORE(x) \
+x = c0;
+/* x = middle accumulator digit */
+#define COMBA_STORE2(x) \
+x = c1;
+/* move on to the next column: shift the accumulator down by one digit */
+#define CARRY_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * i; only i is passed to the asm, j is ignored */
+#define SQRADD(i, j)                                             \
+asm(                                                             \
+"  UMULL  r0,r1,%6,%6              \n\t"                         \
+"  ADDS   %0,%0,r0                 \n\t"                         \
+"  ADCS   %1,%1,r1                 \n\t"                         \
+"  ADC    %2,%2,#0                 \n\t"                         \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
+
+/* c2:c1:c0 += 2 * i * j: cross terms of a square occur twice, so the product is added twice */
+#define SQRADD2(i, j)                                            \
+asm(                                                             \
+"  UMULL  r0,r1,%6,%7              \n\t"                         \
+"  ADDS   %0,%0,r0                 \n\t"                         \
+"  ADCS   %1,%1,r1                 \n\t"                         \
+"  ADC    %2,%2,#0                 \n\t"                         \
+"  ADDS   %0,%0,r0                 \n\t"                         \
+"  ADCS   %1,%1,r1                 \n\t"                         \
+"  ADC    %2,%2,#0                 \n\t"                         \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+/* sc2:sc1:sc0 = i * j (starts a new secondary sum, sc2 is cleared) */
+#define SQRADDSC(i, j)                                           \
+asm(                                                             \
+"  UMULL  %0,%1,%6,%7              \n\t"                         \
+"  SUB    %2,%2,%2                 \n\t"                         \
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
+/* sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j)                                           \
+asm(                                                             \
+"  UMULL  r0,r1,%6,%7              \n\t"                         \
+"  ADDS   %0,%0,r0                 \n\t"                         \
+"  ADCS   %1,%1,r1                 \n\t"                         \
+"  ADC    %2,%2,#0                 \n\t"                         \
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+/* c2:c1:c0 += 2 * sc2:sc1:sc0; note that here %3..%5 are sc0..sc2 */
+#define SQRADDDB                                                 \
+asm(                                                             \
+"  ADDS  %0,%0,%3                     \n\t"                      \
+"  ADCS  %1,%1,%4                     \n\t"                      \
+"  ADC   %2,%2,%5                     \n\t"                      \
+"  ADDS  %0,%0,%3                     \n\t"                      \
+"  ADCS  %1,%1,%4                     \n\t"                      \
+"  ADC   %2,%2,%5                     \n\t"                      \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_MIPS)
+/* MIPS32 */
+//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
+/* c2:c1:c0 (c0 lowest) is the caller's column accumulator, sc2:sc1:sc0 a secondary one */
+#define COMBA_START
+/* zero the column accumulator */
+#define CLEAR_CARRY \
+c0 = c1 = c2 = 0;
+/* x = lowest accumulator digit */
+#define COMBA_STORE(x) \
+x = c0;
+/* x = middle accumulator digit */
+#define COMBA_STORE2(x) \
+x = c1;
+/* move on to the next column: shift the accumulator down by one digit */
+#define CARRY_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * i; only i is passed to the asm, j is ignored (carries are derived with sltu) */
+#define SQRADD(i, j)               \
+asm(                               \
+	" multu  %6,%6          \n\t"  \
+	" mflo   $12            \n\t"  \
+	" mfhi   $13            \n\t"  \
+	" addu    %0,%0,$12     \n\t"  \
+	" sltu   $12,%0,$12     \n\t"  \
+	" addu    %1,%1,$13     \n\t"  \
+	" sltu   $13,%1,$13     \n\t"  \
+	" addu    %1,%1,$12     \n\t"  \
+	" sltu   $12,%1,$12     \n\t"  \
+	" addu    %2,%2,$13     \n\t"  \
+	" addu    %2,%2,$12     \n\t"  \
+	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
+
+/* c2:c1:c0 += 2 * i * j: cross terms of a square occur twice, so the product is added twice */
+#define SQRADD2(i, j)             \
+asm(                              \
+	" multu  %6,%7          \n\t" \
+	" mflo   $12            \n\t" \
+	" mfhi   $13            \n\t" \
+	\
+	" addu    %0,%0,$12     \n\t" \
+	" sltu   $14,%0,$12     \n\t" \
+	" addu    %1,%1,$13     \n\t" \
+	" sltu   $15,%1,$13     \n\t" \
+	" addu    %1,%1,$14     \n\t" \
+	" sltu   $14,%1,$14     \n\t" \
+	" addu    %2,%2,$15     \n\t" \
+	" addu    %2,%2,$14     \n\t" \
+	\
+	" addu    %0,%0,$12     \n\t" \
+	" sltu   $14,%0,$12     \n\t" \
+	" addu    %1,%1,$13     \n\t" \
+	" sltu   $15,%1,$13     \n\t" \
+	" addu    %1,%1,$14     \n\t" \
+	" sltu   $14,%1,$14     \n\t" \
+	" addu    %2,%2,$15     \n\t" \
+	" addu    %2,%2,$14     \n\t" \
+	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
+/* sc2:sc1:sc0 = i * j (starts a new secondary sum, sc2 is cleared) */
+#define SQRADDSC(i, j)             \
+asm(                               \
+	" multu  %6,%7          \n\t"  \
+	" mflo   %0             \n\t"  \
+	" mfhi   %1             \n\t"  \
+	" xor    %2,%2,%2       \n\t"  \
+	:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+/* sc2:sc1:sc0 += i * j; NOTE(review): $14 is listed as clobbered but the asm only writes $12/$13 */
+#define SQRADDAC(i, j)            \
+asm(                              \
+	" multu  %6,%7          \n\t" \
+	" mflo   $12            \n\t" \
+	" mfhi   $13            \n\t" \
+	" addu    %0,%0,$12     \n\t" \
+	" sltu   $12,%0,$12     \n\t" \
+	" addu    %1,%1,$13     \n\t" \
+	" sltu   $13,%1,$13     \n\t" \
+	" addu    %1,%1,$12     \n\t" \
+	" sltu   $12,%1,$12     \n\t" \
+	" addu    %2,%2,$13     \n\t" \
+	" addu    %2,%2,$12     \n\t" \
+	:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
+/* c2:c1:c0 += 2 * sc2:sc1:sc0; note that here %3..%5 are sc0..sc2 */
+#define SQRADDDB                   \
+asm(                               \
+	" addu    %0,%0,%3       \n\t" \
+	" sltu   $10,%0,%3       \n\t" \
+	" addu    %1,%1,$10      \n\t" \
+	" sltu   $10,%1,$10      \n\t" \
+	" addu    %1,%1,%4       \n\t" \
+	" sltu   $11,%1,%4       \n\t" \
+	" addu    %2,%2,$10      \n\t" \
+	" addu    %2,%2,$11      \n\t" \
+	" addu    %2,%2,%5       \n\t" \
+	\
+	" addu    %0,%0,%3       \n\t" \
+	" sltu   $10,%0,%3       \n\t" \
+	" addu    %1,%1,$10      \n\t" \
+	" sltu   $10,%1,$10      \n\t" \
+	" addu    %1,%1,%4       \n\t" \
+	" sltu   $11,%1,%4       \n\t" \
+	" addu    %2,%2,$10      \n\t" \
+	" addu    %2,%2,$11      \n\t" \
+	" addu    %2,%2,%5       \n\t" \
+	:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
+
+#else
+/******************************************************************************/
+#define PSTM_ISO
+/* ISO C portable code: used when none of the PSTM_X86/X86_64/ARM/MIPS options is defined */
+/* c2:c1:c0 (c0 lowest) is the caller's column accumulator, sc2:sc1:sc0 a secondary one */
+#define COMBA_START
+/* zero the column accumulator */
+#define CLEAR_CARRY \
+   c0 = c1 = c2 = 0;
+/* x = lowest accumulator digit */
+#define COMBA_STORE(x) \
+   x = c0;
+/* x = middle accumulator digit */
+#define COMBA_STORE2(x) \
+   x = c1;
+/* move on to the next column: shift the accumulator down by one digit */
+#define CARRY_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * j; callers pass the same digit twice to add its square */
+#define SQRADD(i, j)													\
+   do { pstm_word t;													\
+   t = c0 + ((pstm_word)i) * ((pstm_word)j);  c0 = (pstm_digit)t;		\
+   t = c1 + (t >> DIGIT_BIT);											\
+   c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT);				\
+   } while (0);
+
+
+/* c2:c1:c0 += 2 * i * j (product added twice); uses a pstm_word tt that the caller must declare */
+#define SQRADD2(i, j)											\
+   do { pstm_word t;											\
+   t  = ((pstm_word)i) * ((pstm_word)j);						\
+   tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt;					\
+   tt = (pstm_word)c1 + (tt >> DIGIT_BIT);						\
+   c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT);	\
+   tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt;					\
+   tt = (pstm_word)c1 + (tt >> DIGIT_BIT);						\
+   c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT);	\
+   } while (0);
+/* sc2:sc1:sc0 = i * j (starts a new secondary sum, sc2 is cleared) */
+#define SQRADDSC(i, j)										\
+   do { pstm_word t;										\
+	  t =  ((pstm_word)i) * ((pstm_word)j);					\
+	  sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0;	\
+   } while (0);
+/* sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j)														\
+   do { pstm_word t;														\
+   t = ((pstm_word)sc0) + ((pstm_word)i) * ((pstm_word)j);					\
+   sc0 = (pstm_digit)t;														\
+   t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t;			\
+   sc2 += (pstm_digit)(t >> DIGIT_BIT);										\
+   } while (0);
+/* c2:c1:c0 += 2 * sc2:sc1:sc0 */
+#define SQRADDDB															\
+   do { pstm_word t;														\
+   t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0);				\
+   c0 = (pstm_digit)t;														\
+   t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT);			\
+   c1 = (pstm_digit)t;														\
+   c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT);						\
+   } while (0);
+
+#endif /* ISO_C */
+
+/******************************************************************************/
+/*
+	Non-unrolled comba squarer
+ */
+///bbox: pool unused
+#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
+        pstm_sqr_comba_gen(      A, B, paD, paDlen)
+/*
+ * Generic (looping) comba squarer: B = A * A.
+ *
+ *	pool	not used; the wrapper macro above drops it from every call
+ *	A	number to square
+ *	B	destination, grown to 2 * A->used digits when it is too small
+ *	paD	optional caller-provided scratch buffer, may be NULL
+ *	paDlen	size of paD in bytes
+ *
+ * Returns PS_SUCCESS, or PS_MEM_FAIL when B cannot be grown.
+ */
+static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
+			pstm_digit *paD, uint32 paDlen)
+{
+	int16		ndigits, own_scratch;
+	int32       col, k, old_used;
+	pstm_digit  c0, c1, c2, *scratch;
+#ifdef PSTM_ISO
+	pstm_word   tt; /* temporary required by the ISO C SQRADD2() */
+#endif
+
+	/* the square of an n digit number has up to 2n digits */
+	ndigits = A->used + A->used;
+
+	COMBA_START;
+	CLEAR_CARRY;
+
+	/* make sure the destination can hold the complete result */
+	if (B->alloc < ndigits && pstm_grow(B, ndigits) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/*
+	 * The columns are collected in a scratch area before they are copied
+	 * to B. The caller's buffer is used when it is large enough;
+	 * otherwise (or when none was passed) one is obtained from xzalloc().
+	 */
+	if (paD != NULL && paDlen >= (sizeof(pstm_digit) * ndigits)) {
+		own_scratch = 0;
+		scratch = paD;
+		memset(scratch, 0x0, paDlen);
+	} else {
+		own_scratch = 1;
+		scratch = xzalloc(sizeof(pstm_digit) * ndigits);
+	}
+
+	/* one pass per output digit ("column") */
+	for (col = 0; col < ndigits; col++) {
+		int32      lo, hi, pairs;
+
+		/*
+		 * Digit indices with lo + hi == col: hi starts as high as
+		 * the operand allows, lo is whatever remains.
+		 */
+		hi = min(A->used-1, col);
+		lo = col - hi;
+
+		/* how many (lo+k, hi-k) pairs stay inside the operand */
+		pairs = min(A->used-lo, hi+1);
+
+		/*
+		 * Only the pairs with lo+k < hi-k are wanted. The two indices
+		 * close in on each other two steps at a time, hence half the
+		 * distance (rounded up).
+		 */
+		pairs = min(pairs, (hi-lo+1)>>1);
+
+		/* shift the accumulator on to the new column */
+		CARRY_FORWARD;
+
+		/* each cross product occurs twice in the square */
+		for (k = 0; k < pairs; k++) {
+			SQRADD2(A->dp[lo + k], A->dp[hi - k]);
+		}
+
+		/* even columns also hold the square of their middle digit */
+		if ((col&1) == 0) {
+			SQRADD(A->dp[col>>1], A->dp[col>>1]);
+		}
+
+		/* the low accumulator digit is the finished column */
+		COMBA_STORE(scratch[col]);
+	}
+
+	COMBA_FINI;
+
+	/* move the result into B and zero digits left from its old value */
+	old_used = B->used;
+	B->used = ndigits;
+	for (col = 0; col < ndigits; col++) {
+		B->dp[col] = scratch[col];
+	}
+	for (; col < old_used; col++) {
+		B->dp[col] = 0;
+	}
+	pstm_clamp(B);
+
+	/* release the scratch area only if it was allocated here */
+	if (own_scratch) {
+		psFree(scratch, pool);
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	Unrolled Comba loop for 1024 bit keys
+ */
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
+{
+	pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
+#ifdef PSTM_ISO
+	pstm_word   tt;
+#endif
+
+	if (B->alloc < 32) {
+		if (pstm_grow(B, 32) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	a = A->dp;
+	sc0 = sc1 = sc2 = 0;
+
+	COMBA_START;
+
+   /* clear carries */
+   CLEAR_CARRY;
+
+   /* output 0 */
+   SQRADD(a[0],a[0]);
+   COMBA_STORE(b[0]);
+
+   /* output 1 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[1]);
+   COMBA_STORE(b[1]);
+
+   /* output 2 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
+   COMBA_STORE(b[2]);
+
+   /* output 3 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
+   COMBA_STORE(b[3]);
+
+   /* output 4 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
+   COMBA_STORE(b[4]);
+
+   /* output 5 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
+   COMBA_STORE(b[5]);
+
+   /* output 6 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
+   COMBA_STORE(b[6]);
+
+   /* output 7 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
+   COMBA_STORE(b[7]);
+
+   /* output 8 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
+   COMBA_STORE(b[8]);
+
+   /* output 9 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
+   COMBA_STORE(b[9]);
+
+   /* output 10 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
+   COMBA_STORE(b[10]);
+
+   /* output 11 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
+   COMBA_STORE(b[11]);
+
+   /* output 12 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
+   COMBA_STORE(b[12]);
+
+   /* output 13 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
+   COMBA_STORE(b[13]);
+
+   /* output 14 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
+   COMBA_STORE(b[14]);
+
+   /* output 15 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
+   COMBA_STORE(b[15]);
+
+   /* output 16 */
+   CARRY_FORWARD;
+   SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
+   COMBA_STORE(b[16]);
+
+   /* output 17 */
+   CARRY_FORWARD;
+   SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
+   COMBA_STORE(b[17]);
+
+   /* output 18 */
+   CARRY_FORWARD;
+   SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
+   COMBA_STORE(b[18]);
+
+   /* output 19 */
+   CARRY_FORWARD;
+   SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
+   COMBA_STORE(b[19]);
+
+   /* output 20 */
+   CARRY_FORWARD;
+   SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
+   COMBA_STORE(b[20]);
+
+   /* output 21 */
+   CARRY_FORWARD;
+   SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
+   COMBA_STORE(b[21]);
+
+   /* output 22 */
+   CARRY_FORWARD;
+   SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
+   COMBA_STORE(b[22]);
+
+   /* output 23 */
+   CARRY_FORWARD;
+   SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
+   COMBA_STORE(b[23]);
+
+   /* output 24 */
+   CARRY_FORWARD;
+   SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
+   COMBA_STORE(b[24]);
+
+   /* output 25 */
+   CARRY_FORWARD;
+   SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
+   COMBA_STORE(b[25]);
+
+   /* output 26 */
+   CARRY_FORWARD;
+   SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
+   COMBA_STORE(b[26]);
+
+   /* output 27 */
+   CARRY_FORWARD;
+   SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
+   COMBA_STORE(b[27]);
+
+   /* output 28 */
+   CARRY_FORWARD;
+   SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
+   COMBA_STORE(b[28]);
+
+   /* output 29 */
+   CARRY_FORWARD;
+   SQRADD2(a[14], a[15]);
+   COMBA_STORE(b[29]);
+
+   /* output 30 */
+   CARRY_FORWARD;
+   SQRADD(a[15], a[15]);
+   COMBA_STORE(b[30]);
+   COMBA_STORE2(b[31]);
+   COMBA_FINI;
+
+   B->used = 32;
+   B->sign = PSTM_ZPOS;
+   memcpy(B->dp, b, 32 * sizeof(pstm_digit));
+   pstm_clamp(B);
+   return PSTM_OKAY;
+}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+
+
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B) /* unrolled squaring for an A of exactly 32 digits (reads a[0..31]); the 64-digit result is stored in B */
+{
+   pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2; /* b: result digits, built on the stack and copied into B at the end */
+#ifdef PSTM_ISO
+   pstm_word tt;
+#endif
+
+	if (B->alloc < 64) { /* B must have room for all 64 result digits */
+		if (pstm_grow(B, 64) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	sc0 = sc1 = sc2 = 0; /* not referenced by name below; presumably the accumulator used inside the SQRADDSC/SQRADDAC/SQRADDDB macros - confirm in their definitions */
+   a = A->dp;
+   COMBA_START;
+
+   /* clear carries */
+   CLEAR_CARRY;
+
+   /* output 0: every "output k" block stores result digit b[k], built from the a[i], a[j] pairs with i + j == k */
+   SQRADD(a[0],a[0]);
+   COMBA_STORE(b[0]);
+
+   /* output 1 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[1]);
+   COMBA_STORE(b[1]);
+
+   /* output 2 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
+   COMBA_STORE(b[2]);
+
+   /* output 3 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
+   COMBA_STORE(b[3]);
+
+   /* output 4 */
+   CARRY_FORWARD;
+   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
+   COMBA_STORE(b[4]);
+
+   /* output 5: columns with three or more off-diagonal pairs use SQRADDSC/SQRADDAC followed by one SQRADDDB instead of one SQRADD2 per pair */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
+   COMBA_STORE(b[5]);
+
+   /* output 6 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
+   COMBA_STORE(b[6]);
+
+   /* output 7 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
+   COMBA_STORE(b[7]);
+
+   /* output 8 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
+   COMBA_STORE(b[8]);
+
+   /* output 9 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
+   COMBA_STORE(b[9]);
+
+   /* output 10 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
+   COMBA_STORE(b[10]);
+
+   /* output 11 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
+   COMBA_STORE(b[11]);
+
+   /* output 12 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
+   COMBA_STORE(b[12]);
+
+   /* output 13 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
+   COMBA_STORE(b[13]);
+
+   /* output 14 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
+   COMBA_STORE(b[14]);
+
+   /* output 15 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
+   COMBA_STORE(b[15]);
+
+   /* output 16 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
+   COMBA_STORE(b[16]);
+
+   /* output 17 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
+   COMBA_STORE(b[17]);
+
+   /* output 18 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
+   COMBA_STORE(b[18]);
+
+   /* output 19 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
+   COMBA_STORE(b[19]);
+
+   /* output 20 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
+   COMBA_STORE(b[20]);
+
+   /* output 21 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
+   COMBA_STORE(b[21]);
+
+   /* output 22 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
+   COMBA_STORE(b[22]);
+
+   /* output 23 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
+   COMBA_STORE(b[23]);
+
+   /* output 24 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
+   COMBA_STORE(b[24]);
+
+   /* output 25 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
+   COMBA_STORE(b[25]);
+
+   /* output 26 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
+   COMBA_STORE(b[26]);
+
+   /* output 27 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
+   COMBA_STORE(b[27]);
+
+   /* output 28 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
+   COMBA_STORE(b[28]);
+
+   /* output 29 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
+   COMBA_STORE(b[29]);
+
+   /* output 30 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
+   COMBA_STORE(b[30]);
+
+   /* output 31 */
+   CARRY_FORWARD;
+   SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
+   COMBA_STORE(b[31]);
+
+   /* output 32 */
+   CARRY_FORWARD;
+   SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
+   COMBA_STORE(b[32]);
+
+   /* output 33 */
+   CARRY_FORWARD;
+   SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
+   COMBA_STORE(b[33]);
+
+   /* output 34 */
+   CARRY_FORWARD;
+   SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
+   COMBA_STORE(b[34]);
+
+   /* output 35 */
+   CARRY_FORWARD;
+   SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
+   COMBA_STORE(b[35]);
+
+   /* output 36 */
+   CARRY_FORWARD;
+   SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
+   COMBA_STORE(b[36]);
+
+   /* output 37 */
+   CARRY_FORWARD;
+   SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
+   COMBA_STORE(b[37]);
+
+   /* output 38 */
+   CARRY_FORWARD;
+   SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
+   COMBA_STORE(b[38]);
+
+   /* output 39 */
+   CARRY_FORWARD;
+   SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
+   COMBA_STORE(b[39]);
+
+   /* output 40 */
+   CARRY_FORWARD;
+   SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
+   COMBA_STORE(b[40]);
+
+   /* output 41 */
+   CARRY_FORWARD;
+   SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
+   COMBA_STORE(b[41]);
+
+   /* output 42 */
+   CARRY_FORWARD;
+   SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
+   COMBA_STORE(b[42]);
+
+   /* output 43 */
+   CARRY_FORWARD;
+   SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
+   COMBA_STORE(b[43]);
+
+   /* output 44 */
+   CARRY_FORWARD;
+   SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
+   COMBA_STORE(b[44]);
+
+   /* output 45 */
+   CARRY_FORWARD;
+   SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
+   COMBA_STORE(b[45]);
+
+   /* output 46 */
+   CARRY_FORWARD;
+   SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
+   COMBA_STORE(b[46]);
+
+   /* output 47 */
+   CARRY_FORWARD;
+   SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
+   COMBA_STORE(b[47]);
+
+   /* output 48 */
+   CARRY_FORWARD;
+   SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
+   COMBA_STORE(b[48]);
+
+   /* output 49 */
+   CARRY_FORWARD;
+   SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
+   COMBA_STORE(b[49]);
+
+   /* output 50 */
+   CARRY_FORWARD;
+   SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
+   COMBA_STORE(b[50]);
+
+   /* output 51 */
+   CARRY_FORWARD;
+   SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
+   COMBA_STORE(b[51]);
+
+   /* output 52 */
+   CARRY_FORWARD;
+   SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
+   COMBA_STORE(b[52]);
+
+   /* output 53 */
+   CARRY_FORWARD;
+   SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
+   COMBA_STORE(b[53]);
+
+   /* output 54 */
+   CARRY_FORWARD;
+   SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
+   COMBA_STORE(b[54]);
+
+   /* output 55 */
+   CARRY_FORWARD;
+   SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
+   COMBA_STORE(b[55]);
+
+   /* output 56 */
+   CARRY_FORWARD;
+   SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
+   COMBA_STORE(b[56]);
+
+   /* output 57 */
+   CARRY_FORWARD;
+   SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
+   COMBA_STORE(b[57]);
+
+   /* output 58 */
+   CARRY_FORWARD;
+   SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
+   COMBA_STORE(b[58]);
+
+   /* output 59 */
+   CARRY_FORWARD;
+   SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
+   COMBA_STORE(b[59]);
+
+   /* output 60 */
+   CARRY_FORWARD;
+   SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
+   COMBA_STORE(b[60]);
+
+   /* output 61 */
+   CARRY_FORWARD;
+   SQRADD2(a[30], a[31]);
+   COMBA_STORE(b[61]);
+
+   /* output 62 */
+   CARRY_FORWARD;
+   SQRADD(a[31], a[31]);
+   COMBA_STORE(b[62]);
+   COMBA_STORE2(b[63]); /* the last digit b[63] has no product terms of its own */
+   COMBA_FINI;
+
+   B->used = 64;
+   B->sign = PSTM_ZPOS;
+   memcpy(B->dp, b, 64 * sizeof(pstm_digit)); /* publish the stack-built digits into B */
+   pstm_clamp(B); /* presumably lowers B->used past leading zero digits - confirm in pstm_clamp */
+   return PSTM_OKAY;
+}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+
+/******************************************************************************/
+/*
+ * Square A into B, choosing the routine by the digit count of A:
+ *   A->used == 16 -> pstm_sqr_comba16 (only if USE_1024_KEY_SPEED_OPTIMIZATIONS)
+ *   A->used == 32 -> pstm_sqr_comba32 (only if USE_2048_KEY_SPEED_OPTIMIZATIONS)
+ *   anything else -> pstm_sqr_comba_gen
+ * pool, paD and paDlen are only forwarded to pstm_sqr_comba_gen; the fixed-size
+ * routines do not take them.
+ * The two size checks are independent of each other, so each one sits in its
+ * own #ifdef. The return value is whatever the selected routine returned.
+ */
+int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
+		uint32 paDlen)
+{
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+	if (A->used == 16) {
+		return pstm_sqr_comba16(A, B);
+	}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+	if (A->used == 32) {
+		return pstm_sqr_comba32(A, B);
+	}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+	return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
+}
+
+#endif /* DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_rsa.c b/networking/tls_rsa.c
new file mode 100644
index 0000000..058b09c
--- /dev/null
+++ b/networking/tls_rsa.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/*
+ * Build a PKCS#1 v1.5 block in out[0..outlen): 0x00 | cryptType | padding | 0x00 | in[0..inlen).
+ * Padding is 0xFF bytes for PUBKEY_TYPE, otherwise nonzero bytes from matrixCryptoGetPrngData;
+ * at least 8 padding bytes are required. Returns outlen, or PS_LIMIT_FAIL / PS_PLATFORM_FAIL.
+ */
+#define pkcs1Pad(in, inlen, out, outlen, cryptType, userPtr) \
+        pkcs1Pad(in, inlen, out, outlen, cryptType)
+static ///bbox: the macro above strips userPtr, from this definition as well as from callers
+int32 pkcs1Pad(unsigned char *in, uint32 inlen, unsigned char *out,
+                                           uint32 outlen, int32 cryptType, void *userPtr)
+{
+        unsigned char   *c;
+        uint32          randomLen;
+
+        /* 3 framing bytes + 8 padding bytes minimum; compare before subtracting so uint32 math cannot wrap */
+        if (outlen < 11 || inlen > outlen - 11) {
+                psTraceCrypto("pkcs1Pad failure\n");
+                return PS_LIMIT_FAIL;
+        }
+        randomLen = outlen - 3 - inlen;
+        out[0] = 0x00;
+        out[1] = (unsigned char)cryptType;
+        c = out + 2;
+        if (cryptType == PUBKEY_TYPE) {
+                while (randomLen-- > 0) {
+                        *c++ = 0xFF;
+                }
+        } else {
+                if (matrixCryptoGetPrngData(c, randomLen, userPtr) < 0) {
+                        return PS_PLATFORM_FAIL;
+                }
+                /* SECURITY: per spec no padding byte may be 0, so change every 0x0 the PRNG produced to 0x01 */
+                while (randomLen-- > 0) {
+                        if (*c == 0x0) {
+                                *c = 0x01;
+                        }
+                        c++;
+                }
+        }
+        *c++ = 0x00; /* separator between padding and payload */
+        memcpy(c, in, inlen);
+
+        return outlen;
+}
+
+/*
+ * Raw RSA on in[0..inlen): in^e mod N for PUBKEY_TYPE, in^d mod N for PRIVKEY_TYPE
+ * (the latter via p, q, dP, dQ, qP when key->optimized is set). in may equal out.
+ * *outlen: room in out on entry; on success the number of bytes stored, which is
+ * max(byte size of N, key->size), value right-aligned and zero-padded on the left.
+ * Returns PS_SUCCESS, or PS_ARG_FAIL, PS_LIMIT_FAIL (in > N), PS_FAILURE or -1.
+ */
+#define psRsaCrypt(pool, in, inlen, out, outlen, key, type, data) \
+        psRsaCrypt(pool, in, inlen, out, outlen, key, type)
+static ///bbox
+int32 psRsaCrypt(psPool_t *pool, const unsigned char *in, uint32 inlen,
+			unsigned char *out, uint32 *outlen, psRsaKey_t *key, int32 type,
+			void *data) /* 'data' is removed by the macro above, here and at every call */
+{
+	pstm_int		tmp, tmpa, tmpb;
+	int32			res;
+	uint32			x;
+
+	if (in == NULL || out == NULL || outlen == NULL || key == NULL) {
+		psTraceCrypto("NULL parameter error in psRsaCrypt\n");
+		return PS_ARG_FAIL;
+	}
+
+	tmp.dp = tmpa.dp = tmpb.dp = NULL;
+
+	/* Init and copy into tmp */
+	if (pstm_init_for_read_unsigned_bin(pool, &tmp, inlen + sizeof(pstm_digit))
+			!= PS_SUCCESS) {
+		return PS_FAILURE;
+	}
+	if (pstm_read_unsigned_bin(&tmp, (unsigned char *)in, inlen) != PS_SUCCESS){
+		pstm_clear(&tmp);
+		return PS_FAILURE;
+	}
+	/* Sanity check on the input */
+	if (pstm_cmp(&key->N, &tmp) == PSTM_LT) {
+		res = PS_LIMIT_FAIL;
+		goto done;
+	}
+	if (type == PRIVKEY_TYPE) {
+		if (key->optimized) {
+			if (pstm_init_size(pool, &tmpa, key->p.alloc) != PS_SUCCESS) {
+				res = PS_FAILURE;
+				goto done;
+			}
+			if (pstm_init_size(pool, &tmpb, key->q.alloc) != PS_SUCCESS) {
+				pstm_clear(&tmpa);
+				res = PS_FAILURE;
+				goto done;
+			}
+			if (pstm_exptmod(pool, &tmp, &key->dP, &key->p, &tmpa) != PS_SUCCESS) { /* tmpa = tmp^dP mod p */
+				psTraceCrypto("decrypt error: pstm_exptmod dP, p\n");
+				goto error;
+			}
+			if (pstm_exptmod(pool, &tmp, &key->dQ, &key->q, &tmpb) != PS_SUCCESS) { /* tmpb = tmp^dQ mod q */
+				psTraceCrypto("decrypt error: pstm_exptmod dQ, q\n");
+				goto error;
+			}
+			if (pstm_sub(&tmpa, &tmpb, &tmp) != PS_SUCCESS) { /* tmp = tmpa - tmpb */
+				psTraceCrypto("decrypt error: sub tmpb, tmp\n");
+				goto error;
+			}
+			if (pstm_mulmod(pool, &tmp, &key->qP, &key->p, &tmp) != PS_SUCCESS) { /* tmp = tmp * qP mod p */
+				psTraceCrypto("decrypt error: pstm_mulmod qP, p\n");
+				goto error;
+			}
+			if (pstm_mul_comba(pool, &tmp, &key->q, &tmp, NULL, 0)
+					!= PS_SUCCESS){ /* tmp = tmp * q */
+				psTraceCrypto("decrypt error: pstm_mul q \n");
+				goto error;
+			}
+			if (pstm_add(&tmp, &tmpb, &tmp) != PS_SUCCESS) { /* tmp = tmp + tmpb */
+				psTraceCrypto("decrypt error: pstm_add tmp \n");
+				goto error;
+			}
+		} else {
+			if (pstm_exptmod(pool, &tmp, &key->d, &key->N, &tmp) != PS_SUCCESS) {
+				psTraceCrypto("psRsaCrypt error: pstm_exptmod\n");
+				goto error;
+			}
+		}
+	} else if (type == PUBKEY_TYPE) {
+		if (pstm_exptmod(pool, &tmp, &key->e, &key->N, &tmp) != PS_SUCCESS) {
+			psTraceCrypto("psRsaCrypt error: pstm_exptmod\n");
+			goto error;
+		}
+	} else {
+		psTraceCrypto("psRsaCrypt error: invalid type param\n");
+		goto error;
+	}
+	/* Read it back */
+	x = pstm_unsigned_bin_size(&key->N);
+	/* We want the encrypted value to always be the key size: widen the field before checking the room in out */
+	if (x < key->size) {
+		x = key->size;
+	}
+	if (x > *outlen) {
+		res = -1;
+		psTraceCrypto("psRsaCrypt error: pstm_unsigned_bin_size\n");
+		goto done;
+	}
+	*outlen = x;
+	/* Convert it: zero the whole field, then store tmp right-aligned in it (out is not advanced, so nothing lands past out + x) */
+	memset(out, 0x0, x);
+
+	if (pstm_to_unsigned_bin(pool, &tmp, out + (x - pstm_unsigned_bin_size(&tmp))) != PS_SUCCESS) {
+		psTraceCrypto("psRsaCrypt error: pstm_to_unsigned_bin\n");
+		goto error;
+	}
+	/* Clean up and return */
+	res = PS_SUCCESS;
+	goto done;
+error:
+	res = PS_FAILURE;
+done:
+	if (type == PRIVKEY_TYPE && key->optimized) {
+		pstm_clear_multi(&tmpa, &tmpb, NULL, NULL, NULL, NULL, NULL, NULL);
+	}
+	pstm_clear(&tmp);
+	return res;
+}
+
+int32 psRsaEncryptPub(psPool_t *pool, psRsaKey_t *key,
+                                                unsigned char *in, uint32 inlen,
+                                                unsigned char *out, uint32 outlen, void *data)
+{
+        int32   err;
+        uint32  size;
+
+        size = key->size;
+        if (outlen < size) {
+                psTraceCrypto("Error on bad outlen parameter to psRsaEncryptPub\n");
+                return PS_ARG_FAIL;
+        }
+
+        if ((err = pkcs1Pad(in, inlen, out, size, PRIVKEY_TYPE, data))
+                        < PS_SUCCESS) {
+                psTraceCrypto("Error padding psRsaEncryptPub. Likely data too long\n");
+                return err;
+        }
+        if ((err = psRsaCrypt(pool, out, size, out, (uint32*)&outlen, key,
+                        PUBKEY_TYPE, data)) < PS_SUCCESS) {
+                psTraceCrypto("Error performing psRsaEncryptPub\n");
+                return err;
+        }
+        if (outlen != size) {
+                psTraceCrypto("Encrypted size error in psRsaEncryptPub\n");
+                return PS_FAILURE;
+        }
+        return size;
+}
diff --git a/networking/tls_rsa.h b/networking/tls_rsa.h
new file mode 100644
index 0000000..3281087
--- /dev/null
+++ b/networking/tls_rsa.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+
+typedef struct {
+	pstm_int    e, d, N, qP, dP, dQ, p, q; /* psRsaCrypt uses e with N for PUBKEY_TYPE, d with N for PRIVKEY_TYPE, and p, q, dP, dQ, qP on the optimized private-key path */
+	uint32      size;   /* Size of the key in bytes; psRsaEncryptPub requires an output buffer at least this large */
+	int32       optimized; /* nonzero: psRsaCrypt's PRIVKEY_TYPE path uses p, q, dP, dQ, qP instead of d */
+	psPool_t *pool; /* NOTE(review): not referenced by psRsaCrypt/psRsaEncryptPub; presumably the pool the pstm_int members were allocated from - confirm */
+} psRsaKey_t;
+
+#define psRsaEncryptPub(pool, key, in, inlen, out, outlen, data) \
+        psRsaEncryptPub(pool, key, in, inlen, out, outlen)
+int32 psRsaEncryptPub(psPool_t *pool, psRsaKey_t *key,
+                                                unsigned char *in, uint32 inlen, /* data to pad and encrypt */
+                                                unsigned char *out, uint32 outlen, void *data); /* out/outlen: result buffer and its capacity; the same-named macro above strips the trailing 'data' argument from this prototype and from every call that follows it */