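libbb/sha256: code shrink in 32-bit x86

Load PSHUFFLE_BSWAP32_FLIP_MASK into a register once and use that
register in the four pshufb insns of rounds 0-15, instead of repeating
the memory operand in each of them. The register is the former MSGTMP4
(%xmm7), renamed to XMMTMP4 because it now also carries the flip mask
before being reused as a temporary from rounds 12-15 onward.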

function                                             old     new   delta
sha256_process_block64_shaNI                         722     713      -9

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 632dab7..417da37 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -31,7 +31,7 @@
 #define MSGTMP1		%xmm4
 #define MSGTMP2		%xmm5
 #define MSGTMP3		%xmm6
-#define MSGTMP4		%xmm7
+#define XMMTMP4		%xmm7
 
 	.balign	8	# allow decoders to fetch at least 3 first insns
 sha256_process_block64_shaNI:
@@ -45,10 +45,12 @@
 
 	shuf128_32	$0xB1, STATE0,  STATE0		/* CDAB */
 	shuf128_32	$0x1B, STATE1,  STATE1		/* EFGH */
-	mova128		STATE0, MSGTMP4
+	mova128		STATE0, XMMTMP4
 	palignr		$8, STATE1,  STATE0		/* ABEF */
-	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
+	pblendw		$0xF0, XMMTMP4, STATE1		/* CDGH */
 
+/* XMMTMP4 holds flip mask from here... */
+	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4
 	movl		$K256+8*16, SHA256CONSTANTS
 
 	/* Save hash values for addition after rounds */
@@ -57,7 +59,7 @@
 
 	/* Rounds 0-3 */
 	movu128		0*16(DATA_PTR), MSG
-	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	pshufb		XMMTMP4, MSG
 	mova128		MSG, MSGTMP0
 		paddd		0*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
@@ -66,7 +68,7 @@
 
 	/* Rounds 4-7 */
 	movu128		1*16(DATA_PTR), MSG
-	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	pshufb		XMMTMP4, MSG
 	mova128		MSG, MSGTMP1
 		paddd		1*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
@@ -76,7 +78,7 @@
 
 	/* Rounds 8-11 */
 	movu128		2*16(DATA_PTR), MSG
-	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	pshufb		XMMTMP4, MSG
 	mova128		MSG, MSGTMP2
 		paddd		2*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
@@ -86,13 +88,14 @@
 
 	/* Rounds 12-15 */
 	movu128		3*16(DATA_PTR), MSG
-	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	pshufb		XMMTMP4, MSG
+/* ...to here; XMMTMP4 is reused as a scratch register below */
 	mova128		MSG, MSGTMP3
 		paddd		3*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
+	mova128		MSGTMP3, XMMTMP4
+	palignr		$4, MSGTMP2, XMMTMP4
+	paddd		XMMTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -102,9 +105,9 @@
 	mova128		MSGTMP0, MSG
 		paddd		4*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
+	mova128		MSGTMP0, XMMTMP4
+	palignr		$4, MSGTMP3, XMMTMP4
+	paddd		XMMTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -114,9 +117,9 @@
 	mova128		MSGTMP1, MSG
 		paddd		5*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
+	mova128		MSGTMP1, XMMTMP4
+	palignr		$4, MSGTMP0, XMMTMP4
+	paddd		XMMTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -126,9 +129,9 @@
 	mova128		MSGTMP2, MSG
 		paddd		6*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
+	mova128		MSGTMP2, XMMTMP4
+	palignr		$4, MSGTMP1, XMMTMP4
+	paddd		XMMTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -138,9 +141,9 @@
 	mova128		MSGTMP3, MSG
 		paddd		7*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
+	mova128		MSGTMP3, XMMTMP4
+	palignr		$4, MSGTMP2, XMMTMP4
+	paddd		XMMTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -150,9 +153,9 @@
 	mova128		MSGTMP0, MSG
 		paddd		8*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
+	mova128		MSGTMP0, XMMTMP4
+	palignr		$4, MSGTMP3, XMMTMP4
+	paddd		XMMTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -162,9 +165,9 @@
 	mova128		MSGTMP1, MSG
 		paddd		9*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
+	mova128		MSGTMP1, XMMTMP4
+	palignr		$4, MSGTMP0, XMMTMP4
+	paddd		XMMTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -174,9 +177,9 @@
 	mova128		MSGTMP2, MSG
 		paddd		10*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
+	mova128		MSGTMP2, XMMTMP4
+	palignr		$4, MSGTMP1, XMMTMP4
+	paddd		XMMTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -186,9 +189,9 @@
 	mova128		MSGTMP3, MSG
 		paddd		11*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
+	mova128		MSGTMP3, XMMTMP4
+	palignr		$4, MSGTMP2, XMMTMP4
+	paddd		XMMTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -198,9 +201,9 @@
 	mova128		MSGTMP0, MSG
 		paddd		12*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
+	mova128		MSGTMP0, XMMTMP4
+	palignr		$4, MSGTMP3, XMMTMP4
+	paddd		XMMTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -210,9 +213,9 @@
 	mova128		MSGTMP1, MSG
 		paddd		13*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
+	mova128		MSGTMP1, XMMTMP4
+	palignr		$4, MSGTMP0, XMMTMP4
+	paddd		XMMTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -221,9 +224,9 @@
 	mova128		MSGTMP2, MSG
 		paddd		14*16-8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
-	mova128		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
+	mova128		MSGTMP2, XMMTMP4
+	palignr		$4, MSGTMP1, XMMTMP4
+	paddd		XMMTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
@@ -242,9 +245,9 @@
 	/* Write hash values back in the correct order */
 	shuf128_32	$0x1B, STATE0,  STATE0		/* FEBA */
 	shuf128_32	$0xB1, STATE1,  STATE1		/* DCHG */
-	mova128		STATE0, MSGTMP4
+	mova128		STATE0, XMMTMP4
 	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
-	palignr		$8, MSGTMP4, STATE1		/* HGFE */
+	palignr		$8, XMMTMP4, STATE1		/* HGFE */
 
 	movu128		STATE0, 76+0*16(%eax)
 	movu128		STATE1, 76+1*16(%eax)