tls: P256: x86-64 assembly
function                                             old     new   delta
sp_256_mont_mul_8                                    127     155     +28
sp_256_proj_point_dbl_8                              448     469     +21
sp_256_mont_sub_8                                     23      35     +12
sp_256_mont_dbl_8                                     26      38     +12
sp_256_sub_8                                          44      49      +5
sp_256_ecc_mulmod_8                                 1530    1535      +5
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 6/0 up/down: 83/0)             Total: 83 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 5320477..14a7c70 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -195,6 +195,48 @@
: "memory"
);
return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+ /* x86_64 has no alignment restrictions, and is little-endian,
+ * so 64-bit and 32-bit representations are identical */
+ uint64_t reg;
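+ /* What the adc chain below computes, as portable C (a sketch,
+  * assuming unsigned __int128; for reference only, not compiled):
+  *	const uint64_t *aa = (const void*)a;
+  *	const uint64_t *bb = (const void*)b;
+  *	uint64_t *rr = (void*)r;
+  *	unsigned __int128 t = 0;
+  *	int i;
+  *	for (i = 0; i < 4; i++) {
+  *		t += (unsigned __int128)aa[i] + bb[i];
+  *		rr[i] = (uint64_t)t;
+  *		t >>= 64;
+  *	}
+  *	return (sp_digit)(0 - (uint64_t)t); // 0, or all-ones on carry
+  */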
+ asm volatile (
+"\n movq (%0), %3"
+"\n addq (%1), %3"
+"\n movq %3, (%2)"
+"\n"
+"\n movq 1*8(%0), %3"
+"\n adcq 1*8(%1), %3"
+"\n movq %3, 1*8(%2)"
+"\n"
+"\n movq 2*8(%0), %3"
+"\n adcq 2*8(%1), %3"
+"\n movq %3, 2*8(%2)"
+"\n"
+"\n movq 3*8(%0), %3"
+"\n adcq 3*8(%1), %3"
+"\n movq %3, 3*8(%2)"
+"\n"
+"\n sbbq %3, %3"
+"\n"
+ : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+ : "0" (a), "1" (b), "2" (r)
+ : "memory"
+ );
+ return reg;
#else
int i;
sp_digit carry;
@@ -265,6 +307,44 @@
: "memory"
);
return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+ /* x86_64 has no alignment restrictions, and is little-endian,
+ * so 64-bit and 32-bit representations are identical */
+ uint64_t reg;
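+ /* Same pattern as the addition above, with sbb in place of adc
+  * (a sketch, assuming unsigned __int128; aa, bb, rr, i declared
+  * as in the addition sketch; for reference only):
+  *	unsigned __int128 t = 0;
+  *	for (i = 0; i < 4; i++) {
+  *		t = (unsigned __int128)aa[i] - bb[i] - (uint64_t)(t >> 127);
+  *		rr[i] = (uint64_t)t;
+  *	}
+  *	return (sp_digit)(0 - (uint64_t)(t >> 127)); // 0, or all-ones on borrow
+  */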
+ asm volatile (
+"\n movq (%0), %3"
+"\n subq (%1), %3"
+"\n movq %3, (%2)"
+"\n"
+"\n movq 1*8(%0), %3"
+"\n sbbq 1*8(%1), %3"
+"\n movq %3, 1*8(%2)"
+"\n"
+"\n movq 2*8(%0), %3"
+"\n sbbq 2*8(%1), %3"
+"\n movq %3, 2*8(%2)"
+"\n"
+"\n movq 3*8(%0), %3"
+"\n sbbq 3*8(%1), %3"
+"\n movq %3, 3*8(%2)"
+"\n"
+"\n sbbq %3, %3"
+"\n"
+ : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+ : "0" (a), "1" (b), "2" (r)
+ : "memory"
+ );
+ return reg;
#else
int i;
sp_digit borrow;
@@ -380,6 +460,56 @@
}
 rr[15] = accl;
memcpy(r, rr, sizeof(rr));
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+ /* x86_64 has no alignment restrictions, and is little-endian,
+ * so 64-bit and 32-bit representations are identical */
+ const uint64_t* aa = (const void*)a;
+ const uint64_t* bb = (const void*)b;
+ uint64_t rr[8];
+ int k;
+ uint64_t accl;
+ uint64_t acch;
+
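+ /* Column-wise (schoolbook) multiplication: for each result limb k,
+  * all products aa[i]*bb[j] with i + j == k are accumulated into the
+  * 3-limb accumulator acc_hi:acch:accl; the low limb is stored and
+  * the accumulator is shifted right by one limb for the next column. */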
+ acch = accl = 0;
+ for (k = 0; k < 7; k++) {
+ int i, j;
+ uint64_t acc_hi;
+ i = k - 3;
+ if (i < 0)
+ i = 0;
+ j = k - i;
+ acc_hi = 0;
+ do {
+////////////////////////
+// uint128_t m = ((uint128_t)aa[i]) * bb[j];
+// acc_hi:acch:accl += m;
+ uint64_t rax_out;
+ asm volatile (
+ // aa[i] is loaded into rax via the "3" matching constraint;
+ // mulq overwrites both rax and rdx, so rax is declared as a
+ // (dummy) output rather than input-only, and rdx is clobbered
+"\n mulq %8"
+"\n addq %%rax, %0"
+"\n adcq %%rdx, %1"
+"\n adcq $0, %2"
+ : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi), "=a" (rax_out)
+ : "0" (accl), "1" (acch), "2" (acc_hi), "3" (aa[i]), "m" (bb[j])
+ : "cc", "rdx"
+ );
+////////////////////////
+ j--;
+ i++;
+ } while (i != 4 && i <= k);
+ rr[k] = accl;
+ accl = acch;
+ acch = acc_hi;
+ }
+ rr[7] = accl;
+ memcpy(r, rr, sizeof(rr));
#elif 0
//TODO: arm assembly (untested)
sp_digit tmp[16];