libbb/sha1: shrink x86 hardware accelerated hashing

Do all four pshufb byte-swaps of the input block at once, right after
loading the shuffle mask. After the last pshufb the mask is dead, and
its register (%xmm7) can hold the saved E0 value instead.

On 32-bit this removes the stack slot for E0, and with it the %ebp
frame and the "andl $~0xF, %esp" re-alignment: the one remaining saved
value, ABCD, is now fetched with an unaligned movu128 into the (by then
again free) %xmm7 and added with a register-register paddd. On 64-bit,
%xmm9 is no longer used; unlike %xmm9, %xmm7 needs no REX prefix, which
is where the two saved bytes come from.

function                                             old     new   delta
sha1_process_block64_shaNI 32-bit                    524     517      -7
sha1_process_block64_shaNI 64-bit                    510     508      -2

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
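
The idea in C-intrinsics form - a minimal sketch of the reordering, not
the shipped code (the function name is made up, only rounds 0-7 are
shown, and the mask constant is assumed to be the usual big-endian flip
value that the file loads as PSHUFFLE_BYTE_FLIP_MASK):

#include <immintrin.h>	/* build with: gcc -msse4.1 -msha */

/* Byte-swap the whole 64-byte block up front, while the shuffle mask
 * is live, instead of one pshufb per message load spread across the
 * rounds.  After the fourth shuffle 'mask' is dead, so its register
 * (%xmm7 in the assembly) is free to hold the saved E0. */
static __m128i sha1_rounds_0_7_sketch(const unsigned char *data,
					__m128i abcd, __m128i e0)
{
	const __m128i mask = _mm_set_epi64x(0x0001020304050607LL,
					    0x08090a0b0c0d0e0fLL);
	__m128i msg0 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data +  0)), mask);
	__m128i msg1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 16)), mask);
	__m128i msg2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 32)), mask);
	__m128i msg3 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 48)), mask);
	/* 'mask' is dead from here on: its register can hold saved_e0 */
	__m128i saved_e0 = e0;
	__m128i e1;

	/* Rounds 0-3 */
	e0   = _mm_add_epi32(e0, msg0);
	e1   = abcd;
	abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);

	/* Rounds 4-7 */
	e1   = _mm_sha1nexte_epu32(e1, msg1);
	e0   = abcd;
	abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
	msg0 = _mm_sha1msg1_epu32(msg0, msg1);

	/* rounds 8-79 elided; at the end, saved_e0 feeds the final
	 * sha1nexte just as %xmm7 does in the assembly */
	(void)msg0; (void)msg2; (void)msg3; (void)saved_e0; (void)e0;
	return abcd;
}
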
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 5d082eb..0f3fe57 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -32,14 +32,10 @@
 #define MSG1		%xmm4
 #define MSG2		%xmm5
 #define MSG3		%xmm6
-#define SHUF_MASK	%xmm7
 
-	.balign	8	# allow decoders to fetch at least 3 first insns
+	.balign	8	# allow decoders to fetch at least 2 first insns
 sha1_process_block64_shaNI:
-	pushl		%ebp
-	movl		%esp, %ebp
-	subl		$32, %esp
-	andl		$~0xF, %esp	# paddd needs aligned memory operand
+	subl		$16, %esp
 
 	/* load initial hash values */
 	xor128		E0, E0
@@ -47,30 +43,33 @@
 	pinsrd		$3, 76+4*4(%eax), E0	# load to uppermost 32-bit word
 	shuf128_32	$0x1B, ABCD, ABCD	# DCBA -> ABCD
 
-	mova128		PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK
+	mova128		PSHUFFLE_BYTE_FLIP_MASK, %xmm7
+
+	movu128		0*16(%eax), MSG0
+	pshufb		%xmm7, MSG0
+	movu128		1*16(%eax), MSG1
+	pshufb		%xmm7, MSG1
+	movu128		2*16(%eax), MSG2
+	pshufb		%xmm7, MSG2
+	movu128		3*16(%eax), MSG3
+	pshufb		%xmm7, MSG3
 
 	/* Save hash values for addition after rounds */
-	movu128		E0, 16(%esp)
+	movu128		E0, %xmm7
 	movu128		ABCD, (%esp)
 
 	/* Rounds 0-3 */
-	movu128		0*16(%eax), MSG0
-	pshufb		SHUF_MASK, MSG0
 		paddd		MSG0, E0
 		mova128		ABCD, E1
 		sha1rnds4	$0, E0, ABCD
 
 	/* Rounds 4-7 */
-	movu128		1*16(%eax), MSG1
-	pshufb		SHUF_MASK, MSG1
 		sha1nexte	MSG1, E1
 		mova128		ABCD, E0
 		sha1rnds4	$0, E1, ABCD
 	sha1msg1	MSG1, MSG0
 
 	/* Rounds 8-11 */
-	movu128		2*16(%eax), MSG2
-	pshufb		SHUF_MASK, MSG2
 		sha1nexte	MSG2, E0
 		mova128		ABCD, E1
 		sha1rnds4	$0, E0, ABCD
@@ -78,8 +77,6 @@
 	xor128		MSG2, MSG0
 
 	/* Rounds 12-15 */
-	movu128		3*16(%eax), MSG3
-	pshufb		SHUF_MASK, MSG3
 		sha1nexte	MSG3, E1
 		mova128		ABCD, E0
 	sha1msg2	MSG3, MSG0
@@ -210,16 +207,16 @@
 		sha1rnds4	$3, E1, ABCD
 
 	/* Add current hash values with previously saved */
-	sha1nexte	16(%esp), E0
-	paddd		(%esp), ABCD
+	sha1nexte	%xmm7, E0
+	movu128		(%esp), %xmm7
+	paddd		%xmm7, ABCD
 
 	/* Write hash values back in the correct order */
 	shuf128_32	$0x1B, ABCD, ABCD
 	movu128		ABCD, 76(%eax)
 	extr128_32	$3, E0, 76+4*4(%eax)
 
-	movl	%ebp, %esp
-	popl	%ebp
+	addl		$16, %esp
 	ret
 	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
 
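The 32-bit epilogue change above hinges on an alignment rule: legacy
SSE instructions such as paddd fault on an unaligned 16-byte memory
operand, which is what the dropped "andl $~0xF, %esp" used to prevent,
while movdqu (movu128) accepts any address. A minimal C analogue of
the new epilogue (helper name is hypothetical):

#include <immintrin.h>

/* paddd mem,reg needs a 16-byte-aligned operand; loading through
 * movdqu first has no such requirement, at the cost of one extra
 * instruction - cheaper than keeping the whole %ebp frame. */
static __m128i add_saved_abcd(__m128i abcd, const void *save)
{
	/* 'save' may be unaligned, like the no-longer-aligned %esp */
	__m128i tmp = _mm_loadu_si128((const __m128i *)save);	/* movu128 (%esp), %xmm7 */
	return _mm_add_epi32(abcd, tmp);			/* paddd %xmm7, ABCD */
}
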
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index 8ddec87..fc2ca92 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -32,7 +32,6 @@
 #define MSG1		%xmm4
 #define MSG2		%xmm5
 #define MSG3		%xmm6
-#define SHUF_MASK	%xmm7
 
 	.balign	8	# allow decoders to fetch at least 2 first insns
 sha1_process_block64_shaNI:
@@ -43,30 +42,33 @@
 	pinsrd		$3, 80+4*4(%rdi), E0	# load to uppermost 32-bit word
 	shuf128_32	$0x1B, ABCD, ABCD	# DCBA -> ABCD
 
-	mova128		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+	mova128		PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
+
+	movu128		0*16(%rdi), MSG0
+	pshufb		%xmm7, MSG0
+	movu128		1*16(%rdi), MSG1
+	pshufb		%xmm7, MSG1
+	movu128		2*16(%rdi), MSG2
+	pshufb		%xmm7, MSG2
+	movu128		3*16(%rdi), MSG3
+	pshufb		%xmm7, MSG3
 
 	/* Save hash values for addition after rounds */
-	mova128		E0, %xmm9
+	mova128		E0, %xmm7
 	mova128		ABCD, %xmm8
 
 	/* Rounds 0-3 */
-	movu128		0*16(%rdi), MSG0
-	pshufb		SHUF_MASK, MSG0
 		paddd		MSG0, E0
 		mova128		ABCD, E1
 		sha1rnds4	$0, E0, ABCD
 
 	/* Rounds 4-7 */
-	movu128		1*16(%rdi), MSG1
-	pshufb		SHUF_MASK, MSG1
 		sha1nexte	MSG1, E1
 		mova128		ABCD, E0
 		sha1rnds4	$0, E1, ABCD
 	sha1msg1	MSG1, MSG0
 
 	/* Rounds 8-11 */
-	movu128		2*16(%rdi), MSG2
-	pshufb		SHUF_MASK, MSG2
 		sha1nexte	MSG2, E0
 		mova128		ABCD, E1
 		sha1rnds4	$0, E0, ABCD
@@ -74,8 +76,6 @@
 	xor128		MSG2, MSG0
 
 	/* Rounds 12-15 */
-	movu128		3*16(%rdi), MSG3
-	pshufb		SHUF_MASK, MSG3
 		sha1nexte	MSG3, E1
 		mova128		ABCD, E0
 	sha1msg2	MSG3, MSG0
@@ -206,7 +206,7 @@
 		sha1rnds4	$3, E1, ABCD
 
 	/* Add current hash values with previously saved */
-	sha1nexte	%xmm9, E0
+	sha1nexte	%xmm7, E0
 	paddd		%xmm8, ABCD
 
 	/* Write hash values back in the correct order */