tls: fix the case of sp_256_mont_tpl_10() leaving striay high bits

It has no effect on correctness, but interferes with compating internal state
of different implementations.

function                                             old     new   delta
sp_256_proj_point_dbl_10                             443     451      +8
static.sp_256_mont_sub_10                             46      49      +3
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 2/0 up/down: 11/0)               Total: 11 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/networking/tls.c b/networking/tls.c
index 7ae9e5a..4f0e2b6 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -2326,6 +2326,48 @@
 	const int INBUF_STEP = 4 * 1024;
 	struct pollfd pfds[2];
 
+#if 0
+// Debug aid for comparing P256 implementations.
+// Enable this, set SP_DEBUG and FIXED_SECRET to 1,
+// and add
+//	tls_run_copy_loop(NULL, 0);
+// e.g. at the very beginning of wget_main()
+//
+{
+//kbuild:lib-$(CONFIG_TLS) += tls_sp_c32_new.o
+	uint8_t ecc_pub_key32[2 * 32];
+	uint8_t pubkey2x32[2 * 32];
+	uint8_t premaster32[32];
+
+//Fixed input key:
+//	memset(ecc_pub_key32, 0xee, sizeof(ecc_pub_key32));
+//Fixed 000000000000000000000000000000000000ab000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+//	memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32));
+//	ecc_pub_key32[18] = 0xab;
+//Random key:
+	tls_get_random(ecc_pub_key32, sizeof(ecc_pub_key32));
+//Biased random (almost all zeros or almost all ones):
+//	srand(time(NULL) ^ getpid());
+//	if (rand() & 1)
+//		memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32));
+//	else
+//		memset(ecc_pub_key32, 0xff, sizeof(ecc_pub_key32));
+//	ecc_pub_key32[rand() & 0x3f] = rand();
+
+	xmove_fd(xopen("p256.OLD", O_WRONLY | O_CREAT | O_TRUNC), 2);
+	curve_P256_compute_pubkey_and_premaster(
+			pubkey2x32, premaster32,
+			/*point:*/ ecc_pub_key32
+	);
+	xmove_fd(xopen("p256.NEW", O_WRONLY | O_CREAT | O_TRUNC), 2);
+	curve_P256_compute_pubkey_and_premaster_NEW(
+			pubkey2x32, premaster32,
+			/*point:*/ ecc_pub_key32
+	);
+	exit(1);
+}
+#endif
+
 	pfds[0].fd = STDIN_FILENO;
 	pfds[0].events = POLLIN;
 	pfds[1].fd = tls->ifd;
diff --git a/networking/tls.h b/networking/tls.h
index 215e92b..eb0fdd4 100644
--- a/networking/tls.h
+++ b/networking/tls.h
@@ -117,3 +117,7 @@
 void curve_P256_compute_pubkey_and_premaster(
 		uint8_t *pubkey2x32, uint8_t *premaster32,
 		const uint8_t *peerkey2x32) FAST_FUNC;
+
+void curve_P256_compute_pubkey_and_premaster_NEW(
+		uint8_t *pubkey2x32, uint8_t *premaster32,
+		const uint8_t *peerkey2x32) FAST_FUNC;
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 99f9c68..bba22de 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -163,11 +163,13 @@
 	a[j] = 0;
 	for (i = 0; i < 20 && j >= 0; i++) {
 		b = 0;
-		a[j--] |= r[i] << s; b += 8 - s;
+		a[j--] |= r[i] << s;
+		b += 8 - s;
 		if (j < 0)
 			break;
 		while (b < 26) {
-			a[j--] = r[i] >> b; b += 8;
+			a[j--] = r[i] >> b;
+			b += 8;
 			if (j < 0)
 				break;
 		}
@@ -286,9 +288,10 @@
 {
 	sp_256_add_10(r, a, b);
 	sp_256_norm_10(r);
-	if ((r[9] >> 22) > 0)
+	if ((r[9] >> 22) > 0) {
 		sp_256_sub_10(r, r, m);
-	sp_256_norm_10(r);
+		sp_256_norm_10(r);
+	}
 }
 
 /* Subtract two Montgomery form numbers (r = a - b % m) */
@@ -296,10 +299,12 @@
 		const sp_digit* m)
 {
 	sp_256_sub_10(r, a, b);
-	if (r[9] >> 22)
-		sp_256_add_10(r, r, m);
 	sp_256_norm_10(r);
-	r[9] &= 0x03fffff; /* truncate to 22 bits */
+	if (r[9] >> 22) {
+		sp_256_add_10(r, r, m);
+		sp_256_norm_10(r);
+		r[9] &= 0x03fffff; /* truncate to 22 bits */
+	}
 }
 
 /* Double a Montgomery form number (r = a + a % m) */
@@ -317,14 +322,17 @@
 {
 	sp_256_add_10(r, a, a);
 	sp_256_norm_10(r);
-	if ((r[9] >> 22) > 0)
+	if ((r[9] >> 22) > 0) {
 		sp_256_sub_10(r, r, m);
-	sp_256_norm_10(r);
+		sp_256_norm_10(r);
+	}
 	sp_256_add_10(r, r, a);
 	sp_256_norm_10(r);
-	if ((r[9] >> 22) > 0)
+	if ((r[9] >> 22) > 0) {
 		sp_256_sub_10(r, r, m);
-	sp_256_norm_10(r);
+		sp_256_norm_10(r);
+	}
+	r[9] &= 0x03fffff; /* truncate to 22 bits */
 }
 
 /* Shift the result in the high 256 bits down to the bottom. */
@@ -650,6 +658,13 @@
 	if (r->infinity) /* If infinity, don't double */
 		return;
 
+	if (SP_DEBUG) {
+		/* unused part of t2, may result in spurios
+		 * differences in debug output. Clear it.
+		 */
+		memset(t2, 0, sizeof(t2));
+	}
+
 	/* T1 = Z * Z */
 	sp_256_mont_sqr_10(t1, r->z /*, p256_mod, p256_mp_mod*/);
 	/* Z = Y * Z */