libbb/sha256: code shrink in 32-bit x86
function                                             old     new   delta
sha256_process_block64_shaNI                         713     697     -16
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 39e2baf..a849dfc 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -31,35 +31,27 @@
#define MSGTMP1 %xmm4
#define MSGTMP2 %xmm5
#define MSGTMP3 %xmm6
-#define XMMTMP4 %xmm7
- .balign 8 # allow decoders to fetch at least 3 first insns
+#define XMMTMP %xmm7
+
+ .balign 8 # allow decoders to fetch at least 2 first insns
sha256_process_block64_shaNI:
- pushl %ebp
- movl %esp, %ebp
- subl $32, %esp
- andl $~0xF, %esp # paddd needs aligned memory operand
-
movu128 76+0*16(%eax), STATE0
movu128 76+1*16(%eax), STATE1
- shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */
- shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */
- mova128 STATE0, XMMTMP4
- palignr $8, STATE1, STATE0 /* ABEF */
- pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */
+ shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */
+ shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */
+ mova128 STATE0, XMMTMP
+ palignr $8, STATE1, STATE0 /* ABEF */
+ pblendw $0xF0, XMMTMP, STATE1 /* CDGH */
-/* XMMTMP4 holds flip mask from here... */
- mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4
+/* XMMTMP holds flip mask from here... */
+ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
movl $K256+8*16, SHA256CONSTANTS
- /* Save hash values for addition after rounds */
- mova128 STATE0, 0*16(%esp)
- mova128 STATE1, 1*16(%esp)
-
/* Rounds 0-3 */
movu128 0*16(DATA_PTR), MSG
- pshufb XMMTMP4, MSG
+ pshufb XMMTMP, MSG
mova128 MSG, MSGTMP0
paddd 0*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
@@ -68,7 +60,7 @@
/* Rounds 4-7 */
movu128 1*16(DATA_PTR), MSG
- pshufb XMMTMP4, MSG
+ pshufb XMMTMP, MSG
mova128 MSG, MSGTMP1
paddd 1*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
@@ -78,7 +70,7 @@
/* Rounds 8-11 */
movu128 2*16(DATA_PTR), MSG
- pshufb XMMTMP4, MSG
+ pshufb XMMTMP, MSG
mova128 MSG, MSGTMP2
paddd 2*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
@@ -88,14 +80,14 @@
/* Rounds 12-15 */
movu128 3*16(DATA_PTR), MSG
- pshufb XMMTMP4, MSG
+ pshufb XMMTMP, MSG
/* ...to here */
mova128 MSG, MSGTMP3
paddd 3*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP3, XMMTMP4
- palignr $4, MSGTMP2, XMMTMP4
- paddd XMMTMP4, MSGTMP0
+ mova128 MSGTMP3, XMMTMP
+ palignr $4, MSGTMP2, XMMTMP
+ paddd XMMTMP, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -105,9 +97,9 @@
mova128 MSGTMP0, MSG
paddd 4*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP0, XMMTMP4
- palignr $4, MSGTMP3, XMMTMP4
- paddd XMMTMP4, MSGTMP1
+ mova128 MSGTMP0, XMMTMP
+ palignr $4, MSGTMP3, XMMTMP
+ paddd XMMTMP, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -117,9 +109,9 @@
mova128 MSGTMP1, MSG
paddd 5*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP1, XMMTMP4
- palignr $4, MSGTMP0, XMMTMP4
- paddd XMMTMP4, MSGTMP2
+ mova128 MSGTMP1, XMMTMP
+ palignr $4, MSGTMP0, XMMTMP
+ paddd XMMTMP, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -129,9 +121,9 @@
mova128 MSGTMP2, MSG
paddd 6*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP2, XMMTMP4
- palignr $4, MSGTMP1, XMMTMP4
- paddd XMMTMP4, MSGTMP3
+ mova128 MSGTMP2, XMMTMP
+ palignr $4, MSGTMP1, XMMTMP
+ paddd XMMTMP, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -141,9 +133,9 @@
mova128 MSGTMP3, MSG
paddd 7*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP3, XMMTMP4
- palignr $4, MSGTMP2, XMMTMP4
- paddd XMMTMP4, MSGTMP0
+ mova128 MSGTMP3, XMMTMP
+ palignr $4, MSGTMP2, XMMTMP
+ paddd XMMTMP, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -153,9 +145,9 @@
mova128 MSGTMP0, MSG
paddd 8*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP0, XMMTMP4
- palignr $4, MSGTMP3, XMMTMP4
- paddd XMMTMP4, MSGTMP1
+ mova128 MSGTMP0, XMMTMP
+ palignr $4, MSGTMP3, XMMTMP
+ paddd XMMTMP, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -165,9 +157,9 @@
mova128 MSGTMP1, MSG
paddd 9*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP1, XMMTMP4
- palignr $4, MSGTMP0, XMMTMP4
- paddd XMMTMP4, MSGTMP2
+ mova128 MSGTMP1, XMMTMP
+ palignr $4, MSGTMP0, XMMTMP
+ paddd XMMTMP, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -177,9 +169,9 @@
mova128 MSGTMP2, MSG
paddd 10*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP2, XMMTMP4
- palignr $4, MSGTMP1, XMMTMP4
- paddd XMMTMP4, MSGTMP3
+ mova128 MSGTMP2, XMMTMP
+ palignr $4, MSGTMP1, XMMTMP
+ paddd XMMTMP, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -189,9 +181,9 @@
mova128 MSGTMP3, MSG
paddd 11*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP3, XMMTMP4
- palignr $4, MSGTMP2, XMMTMP4
- paddd XMMTMP4, MSGTMP0
+ mova128 MSGTMP3, XMMTMP
+ palignr $4, MSGTMP2, XMMTMP
+ paddd XMMTMP, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -201,9 +193,9 @@
mova128 MSGTMP0, MSG
paddd 12*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP0, XMMTMP4
- palignr $4, MSGTMP3, XMMTMP4
- paddd XMMTMP4, MSGTMP1
+ mova128 MSGTMP0, XMMTMP
+ palignr $4, MSGTMP3, XMMTMP
+ paddd XMMTMP, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -213,9 +205,9 @@
mova128 MSGTMP1, MSG
paddd 13*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP1, XMMTMP4
- palignr $4, MSGTMP0, XMMTMP4
- paddd XMMTMP4, MSGTMP2
+ mova128 MSGTMP1, XMMTMP
+ palignr $4, MSGTMP0, XMMTMP
+ paddd XMMTMP, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -224,9 +216,9 @@
mova128 MSGTMP2, MSG
paddd 14*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- mova128 MSGTMP2, XMMTMP4
- palignr $4, MSGTMP1, XMMTMP4
- paddd XMMTMP4, MSGTMP3
+ mova128 MSGTMP2, XMMTMP
+ palignr $4, MSGTMP1, XMMTMP
+ paddd XMMTMP, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
@@ -238,22 +230,20 @@
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
- /* Add current hash values with previously saved */
- paddd 0*16(%esp), STATE0
- paddd 1*16(%esp), STATE1
-
/* Write hash values back in the correct order */
- shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */
- shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */
- mova128 STATE0, XMMTMP4
- pblendw $0xF0, STATE1, STATE0 /* DCBA */
- palignr $8, XMMTMP4, STATE1 /* HGFE */
-
+ shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */
+ shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */
+ mova128 STATE0, XMMTMP
+ pblendw $0xF0, STATE1, STATE0 /* DCBA */
+ palignr $8, XMMTMP, STATE1 /* HGFE */
+ /* add current hash values to previous ones */
+ movu128 76+0*16(%eax), XMMTMP
+ paddd XMMTMP, STATE0
+ movu128 76+1*16(%eax), XMMTMP
movu128 STATE0, 76+0*16(%eax)
+ paddd XMMTMP, STATE1
movu128 STATE1, 76+1*16(%eax)
- movl %ebp, %esp
- popl %ebp
ret
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI