tls: P256: x86_64 assembly for add, sub and mul

function                                             old     new   delta
sp_256_mont_mul_8                                    127     155     +28
sp_256_proj_point_dbl_8                              448     469     +21
sp_256_mont_sub_8                                     23      35     +12
sp_256_mont_dbl_8                                     26      38     +12
sp_256_sub_8                                          44      49      +5
sp_256_ecc_mulmod_8                                 1530    1535      +5
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 6/0 up/down: 83/0)               Total: 83 bytes
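
On x86_64, the add/sub helpers are implemented as plain addq/adcq and
subq/sbbq chains over four 64-bit limbs. The multiplication's inner step
uses mulq to add one 64x64->128-bit partial product into a 192-bit
accumulator kept in three variables (acc_hi:acch:accl). As a rough
portable-C illustration of that one step (sketch only, not part of the
patch; the helper name and the use of unsigned __int128 are just for
illustration):

	#include <stdint.h>

	/* roughly what "mulq; addq; adcq; adcq $0" does per inner iteration */
	static void mac_64x64_into_192(uint64_t *accl, uint64_t *acch,
			uint64_t *acc_hi, uint64_t a, uint64_t b)
	{
		unsigned __int128 m = (unsigned __int128)a * b;
		unsigned __int128 lo = (unsigned __int128)*accl + (uint64_t)m;
		unsigned __int128 mid = (unsigned __int128)*acch
				+ (uint64_t)(m >> 64)
				+ (uint64_t)(lo >> 64);
		*accl = (uint64_t)lo;
		*acch = (uint64_t)mid;
		*acc_hi += (uint64_t)(mid >> 64);
	}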

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 5320477..14a7c70 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -195,6 +195,34 @@
 		: "memory"
 	);
 	return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+	/* x86_64 has no alignment restrictions, and is little-endian,
+	 * so 64-bit and 32-bit representations are identical */
+	uint64_t reg;
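+	/* one addq + three adcq chain the carry across the four 64-bit limbs;
+	 * the final "sbbq %3,%3" turns the carry flag into 0 or all-ones,
+	 * which is what we return */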
+	asm volatile (
+"\n		movq	(%0), %3"
+"\n		addq	(%1), %3"
+"\n		movq	%3, (%2)"
+"\n"
+"\n		movq	1*8(%0), %3"
+"\n		adcq	1*8(%1), %3"
+"\n		movq	%3, 1*8(%2)"
+"\n"
+"\n		movq	2*8(%0), %3"
+"\n		adcq	2*8(%1), %3"
+"\n		movq	%3, 2*8(%2)"
+"\n"
+"\n		movq	3*8(%0), %3"
+"\n		adcq	3*8(%1), %3"
+"\n		movq	%3, 3*8(%2)"
+"\n"
+"\n		sbbq	%3, %3"
+"\n"
+		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+		: "0" (a), "1" (b), "2" (r)
+		: "memory"
+	);
+	return reg;
 #else
 	int i;
 	sp_digit carry;
@@ -265,6 +293,34 @@
 		: "memory"
 	);
 	return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+	/* x86_64 has no alignment restrictions, and is little-endian,
+	 * so 64-bit and 32-bit representations are identical */
+	uint64_t reg;
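+	/* one subq + three sbbq chain the borrow across the four 64-bit limbs;
+	 * the final "sbbq %3,%3" turns the borrow (CF) into 0 or all-ones,
+	 * which is what we return */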
+	asm volatile (
+"\n		movq	(%0), %3"
+"\n		subq	(%1), %3"
+"\n		movq	%3, (%2)"
+"\n"
+"\n		movq	1*8(%0), %3"
+"\n		sbbq	1*8(%1), %3"
+"\n		movq	%3, 1*8(%2)"
+"\n"
+"\n		movq	2*8(%0), %3"
+"\n		sbbq	2*8(%1), %3"
+"\n		movq	%3, 2*8(%2)"
+"\n"
+"\n		movq	3*8(%0), %3"
+"\n		sbbq	3*8(%1), %3"
+"\n		movq	%3, 3*8(%2)"
+"\n"
+"\n		sbbq	%3, %3"
+"\n"
+		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+		: "0" (a), "1" (b), "2" (r)
+		: "memory"
+	);
+	return reg;
 #else
 	int i;
 	sp_digit borrow;
@@ -380,6 +436,49 @@
 	}
 	r[15] = accl;
 	memcpy(r, rr, sizeof(rr));
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+	/* x86_64 has no alignment restrictions, and is little-endian,
+	 * so 64-bit and 32-bit representations are identical */
+	const uint64_t* aa = (const void*)a;
+	const uint64_t* bb = (const void*)b;
+	uint64_t rr[8];
+	int k;
+	uint64_t accl;
+	uint64_t acch;
+
+	acch = accl = 0;
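+	/* for each result column k, accumulate all partial products
+	 * aa[i]*bb[j] with i+j == k (0 <= i,j <= 3) into acc_hi:acch:accl */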
+	for (k = 0; k < 7; k++) {
+		int i, j;
+		uint64_t acc_hi;
+		i = k - 3;
+		if (i < 0)
+			i = 0;
+		j = k - i;
+		acc_hi = 0;
+		do {
+////////////////////////
+//			uint128_t m = ((uint128_t)aa[i]) * bb[j];
+//			acc_hi:acch:accl += m;
+			asm volatile (
+			// aa[i] is already loaded in %%rax
+"\n			mulq	%7"
+"\n			addq	%%rax, %0"
+"\n			adcq	%%rdx, %1"
+"\n			adcq	$0, %2"
+			: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
+			: "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
+			: "cc", "dx"
+			);
+////////////////////////
+			j--;
+			i++;
+		} while (i != 4 && i <= k);
+		rr[k] = accl;
+		accl = acch;
+		acch = acc_hi;
+	}
+	rr[7] = accl;
+	memcpy(r, rr, sizeof(rr));
 #elif 0
 	//TODO: arm assembly (untested)
 	sp_digit tmp[16];