tls: speed up xor'ing of aligned 16-byte buffers

function                                             old     new   delta
xorbuf_aligned_AES_BLOCK_SIZE                          -      23     +23
xwrite_encrypted                                     585     580      -5
aesgcm_GHASH                                         233     228      -5
GMULT                                                192     187      -5
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/3 up/down: 23/-15)              Total: 8 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/networking/tls.c b/networking/tls.c
index 1f8c21f..b774340 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -357,6 +357,20 @@
 	xorbuf3(dst, dst, src, count);
 }
 
+void FAST_FUNC xorbuf_aligned_AES_BLOCK_SIZE(void *dst, const void *src)
+{
+	unsigned long *d = dst;
+	const unsigned long *s = src;
+	d[0] ^= s[0];
+#if ULONG_MAX <= 0xffffffffffffffff
+	d[1] ^= s[1];
+ #if ULONG_MAX == 0xffffffff
+	d[2] ^= s[2];
+	d[3] ^= s[3];
+ #endif
+#endif
+}
+
 /* Nondestructively see the current hash value */
 static unsigned sha_peek(md5sha_ctx_t *ctx, void *buffer)
 {
@@ -802,10 +816,10 @@
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-	uint8_t aad[13 + 3] ALIGNED(4);   /* +3 creates [16] buffer, simplifying GHASH() */
-	uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-	uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+	uint8_t aad[13 + 3] ALIGNED_long;   /* +3 creates [16] buffer, simplifying GHASH() */
+	uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+	uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
 	uint8_t *buf;
 	struct record_hdr *xhdr;
 	unsigned remaining;
@@ -850,7 +864,7 @@
 	aesgcm_GHASH(tls->H, aad, /*sizeof(aad),*/ tls->outbuf + OUTBUF_PFX, size, authtag /*, sizeof(authtag)*/);
 	COUNTER(nonce) = htonl(1);
 	aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-	xorbuf(authtag, scratch, sizeof(authtag));
+	xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 
 	memcpy(buf, authtag, sizeof(authtag));
 #undef COUNTER
@@ -938,10 +952,10 @@
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-	//uint8_t aad[13 + 3] ALIGNED(4); /* +3 creates [16] buffer, simplifying GHASH() */
-	uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-	//uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+	//uint8_t aad[13 + 3] ALIGNED_long; /* +3 creates [16] buffer, simplifying GHASH() */
+	uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+	//uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
 	unsigned remaining;
 	unsigned cnt;
 
@@ -973,7 +987,7 @@
 	//aesgcm_GHASH(tls->H, aad, tls->inbuf + RECHDR_LEN, size, authtag);
 	//COUNTER(nonce) = htonl(1);
 	//aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-	//xorbuf(authtag, scratch, sizeof(authtag));
+	//xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 
 	//memcmp(buf, authtag, sizeof(authtag)) || DIE("HASH DOES NOT MATCH!");
 #undef COUNTER
diff --git a/networking/tls.h b/networking/tls.h
index 4b0dc74..494ed78 100644
--- a/networking/tls.h
+++ b/networking/tls.h
@@ -81,8 +81,12 @@
 #define AES_BLOCK_SIZE  16
 
 void tls_get_random(void *buf, unsigned len) FAST_FUNC;
+
 void xorbuf(void* buf, const void* mask, unsigned count) FAST_FUNC;
 
+#define ALIGNED_long ALIGNED(sizeof(long))
+void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
+
 #define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
 
 #define psFree(p, pool)    free(p)
diff --git a/networking/tls_aesgcm.c b/networking/tls_aesgcm.c
index db720e5..fd72540 100644
--- a/networking/tls_aesgcm.c
+++ b/networking/tls_aesgcm.c
@@ -50,8 +50,8 @@
 
 static void GMULT(byte* X, byte* Y)
 {
-    byte Z[AES_BLOCK_SIZE];
-    byte V[AES_BLOCK_SIZE];
+    byte Z[AES_BLOCK_SIZE] ALIGNED_long;
+    byte V[AES_BLOCK_SIZE] ALIGNED_long;
     int i, j;
 
     XMEMSET(Z, 0, AES_BLOCK_SIZE);
@@ -62,7 +62,7 @@
         for (j = 0; j < 8; j++)
         {
             if (y & 0x80) {
-                xorbuf(Z, V, AES_BLOCK_SIZE);
+                xorbuf_aligned_AES_BLOCK_SIZE(Z, V);
             }
 
             RIGHTSHIFTX(V);
@@ -86,8 +86,8 @@
     byte* s //, unsigned sSz
 )
 {
-    byte x[AES_BLOCK_SIZE] ALIGNED(4);
-    byte scratch[AES_BLOCK_SIZE] ALIGNED(4);
+    byte x[AES_BLOCK_SIZE] ALIGNED_long;
+    byte scratch[AES_BLOCK_SIZE] ALIGNED_long;
     word32 blocks, partial;
     //was: byte* h = aes->H;
 
@@ -116,6 +116,7 @@
         blocks = cSz / AES_BLOCK_SIZE;
         partial = cSz % AES_BLOCK_SIZE;
         while (blocks--) {
+            //xorbuf_aligned_AES_BLOCK_SIZE(x, c); - can't use it: c is not guaranteed to be long-aligned
             xorbuf(x, c, AES_BLOCK_SIZE);
             GMULT(x, h);
             c += AES_BLOCK_SIZE;
@@ -124,7 +125,7 @@
             //XMEMSET(scratch, 0, AES_BLOCK_SIZE);
             //XMEMCPY(scratch, c, partial);
             //xorbuf(x, scratch, AES_BLOCK_SIZE);
-            xorbuf(x, c, partial);
+            xorbuf(x, c, partial); //same result as the commented-out memset+memcpy+xorbuf above
             GMULT(x, h);
         }
     }
@@ -132,7 +133,7 @@
     /* Hash in the lengths of A and C in bits */
     FlattenSzInBits(&scratch[0], aSz);
     FlattenSzInBits(&scratch[8], cSz);
-    xorbuf(x, scratch, AES_BLOCK_SIZE);
+    xorbuf_aligned_AES_BLOCK_SIZE(x, scratch);
     GMULT(x, h);
 
     /* Copy the result into s. */