libbb/sha1: shrink x86 hardware accelerated hashing

Load and byte-swap all four 16-byte message words up front instead of
interleaving the loads with the first four round groups. The shuffle
mask then no longer needs to stay live across the rounds, so %xmm7
(previously reserved as SHUF_MASK) can hold the saved E0 value instead.
On 32-bit this eliminates the frame pointer setup and the explicit
stack re-alignment: only ABCD is still spilled, and it is reloaded with
movu128 before the final paddd, so paddd's aligned-memory-operand
requirement no longer applies. On 64-bit, keeping the saved E0 in
%xmm7 rather than %xmm9 drops the REX prefixes from the two
instructions that touch it.
function                                             old     new   delta
sha1_process_block64_shaNI 32-bit                    524     517      -7
sha1_process_block64_shaNI 64-bit                    510     508      -2

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
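---
For reference, the same scheduling idea sketched in C with the SHA-NI
intrinsics (build with gcc -msha -msse4.1). This is an illustrative
sketch, not part of the patch: the function name is made up and only
round groups 0-15 are shown; rounds 16-79 continue the same
sha1msg1/sha1msg2 recurrence. The point is that doing all four loads
and byte swaps first lets the shuffle-mask value die early, which is
what frees %xmm7 in the assembly version.

#include <immintrin.h>

/* NOT a real SHA-1: rounds 16-79 are elided; this only demonstrates
 * the hoisted load+byte-swap scheduling. */
void sha1_rounds_sketch(unsigned int state[5], const unsigned char data[64])
{
	/* Reversing all 16 bytes both makes each dword big-endian and
	 * places W0 in the uppermost lane, as the SHA insns expect. */
	const __m128i flip = _mm_set_epi64x(0x0001020304050607ULL,
					    0x08090a0b0c0d0e0fULL);
	__m128i abcd, e0, e1, msg0, msg1, msg2, msg3, abcd_save, e_save;

	abcd = _mm_loadu_si128((const __m128i *)&state[0]);
	abcd = _mm_shuffle_epi32(abcd, 0x1B);         /* DCBA -> ABCD */
	e0   = _mm_set_epi32((int)state[4], 0, 0, 0); /* E in top lane */

	/* Hoisted loads + byte swaps: 'flip' is dead after these four
	 * shuffles, so its register is free for the rest of the body. */
	msg0 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data +  0)), flip);
	msg1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 16)), flip);
	msg2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 32)), flip);
	msg3 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 48)), flip);

	/* Save hash values for addition after rounds; e_save stays in a
	 * register throughout, like %xmm7 in the patched assembly. */
	abcd_save = abcd;
	e_save    = e0;

	/* Rounds 0-3 */
	e0   = _mm_add_epi32(e0, msg0);
	e1   = abcd;
	abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
	/* Rounds 4-7 */
	e1   = _mm_sha1nexte_epu32(e1, msg1);
	e0   = abcd;
	abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
	msg0 = _mm_sha1msg1_epu32(msg0, msg1);
	/* Rounds 8-11 */
	e0   = _mm_sha1nexte_epu32(e0, msg2);
	e1   = abcd;
	abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
	msg1 = _mm_sha1msg1_epu32(msg1, msg2);
	msg0 = _mm_xor_si128(msg0, msg2);
	/* Rounds 12-15 */
	e1   = _mm_sha1nexte_epu32(e1, msg3);
	e0   = abcd;
	msg0 = _mm_sha1msg2_epu32(msg0, msg3);
	abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
	msg2 = _mm_sha1msg1_epu32(msg2, msg3);

	/* ...rounds 16-79 elided... */

	/* Add current hash values with previously saved */
	e0   = _mm_sha1nexte_epu32(e0, e_save);
	abcd = _mm_add_epi32(abcd, abcd_save);

	/* Write hash values back in the correct order */
	abcd = _mm_shuffle_epi32(abcd, 0x1B);
	_mm_storeu_si128((__m128i *)&state[0], abcd);
	state[4] = (unsigned int)_mm_extract_epi32(e0, 3);
}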
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 5d082eb..0f3fe57 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -32,14 +32,10 @@
#define MSG1 %xmm4
#define MSG2 %xmm5
#define MSG3 %xmm6
-#define SHUF_MASK %xmm7
- .balign 8 # allow decoders to fetch at least 3 first insns
+ .balign 8 # allow decoders to fetch at least 2 first insns
sha1_process_block64_shaNI:
- pushl %ebp
- movl %esp, %ebp
- subl $32, %esp
- andl $~0xF, %esp # paddd needs aligned memory operand
+ subl $16, %esp
/* load initial hash values */
xor128 E0, E0
@@ -47,30 +43,33 @@
pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word
shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
- mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK
+ mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7
+
+ movu128 0*16(%eax), MSG0
+ pshufb %xmm7, MSG0
+ movu128 1*16(%eax), MSG1
+ pshufb %xmm7, MSG1
+ movu128 2*16(%eax), MSG2
+ pshufb %xmm7, MSG2
+ movu128 3*16(%eax), MSG3
+ pshufb %xmm7, MSG3
/* Save hash values for addition after rounds */
- movu128 E0, 16(%esp)
+ movu128 E0, %xmm7
movu128 ABCD, (%esp)
/* Rounds 0-3 */
- movu128 0*16(%eax), MSG0
- pshufb SHUF_MASK, MSG0
paddd MSG0, E0
mova128 ABCD, E1
sha1rnds4 $0, E0, ABCD
/* Rounds 4-7 */
- movu128 1*16(%eax), MSG1
- pshufb SHUF_MASK, MSG1
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1rnds4 $0, E1, ABCD
sha1msg1 MSG1, MSG0
/* Rounds 8-11 */
- movu128 2*16(%eax), MSG2
- pshufb SHUF_MASK, MSG2
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1rnds4 $0, E0, ABCD
@@ -78,8 +77,6 @@
xor128 MSG2, MSG0
/* Rounds 12-15 */
- movu128 3*16(%eax), MSG3
- pshufb SHUF_MASK, MSG3
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1msg2 MSG3, MSG0
@@ -210,16 +207,16 @@
sha1rnds4 $3, E1, ABCD
/* Add current hash values with previously saved */
- sha1nexte 16(%esp), E0
- paddd (%esp), ABCD
+ sha1nexte %xmm7, E0
+ movu128 (%esp), %xmm7
+ paddd %xmm7, ABCD
/* Write hash values back in the correct order */
shuf128_32 $0x1B, ABCD, ABCD
movu128 ABCD, 76(%eax)
extr128_32 $3, E0, 76+4*4(%eax)
- movl %ebp, %esp
- popl %ebp
+ addl $16, %esp
ret
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index 8ddec87..fc2ca92 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -32,7 +32,6 @@
#define MSG1 %xmm4
#define MSG2 %xmm5
#define MSG3 %xmm6
-#define SHUF_MASK %xmm7
.balign 8 # allow decoders to fetch at least 2 first insns
sha1_process_block64_shaNI:
@@ -43,30 +42,33 @@
pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word
shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
- mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
+
+ movu128 0*16(%rdi), MSG0
+ pshufb %xmm7, MSG0
+ movu128 1*16(%rdi), MSG1
+ pshufb %xmm7, MSG1
+ movu128 2*16(%rdi), MSG2
+ pshufb %xmm7, MSG2
+ movu128 3*16(%rdi), MSG3
+ pshufb %xmm7, MSG3
/* Save hash values for addition after rounds */
- mova128 E0, %xmm9
+ mova128 E0, %xmm7
mova128 ABCD, %xmm8
/* Rounds 0-3 */
- movu128 0*16(%rdi), MSG0
- pshufb SHUF_MASK, MSG0
paddd MSG0, E0
mova128 ABCD, E1
sha1rnds4 $0, E0, ABCD
/* Rounds 4-7 */
- movu128 1*16(%rdi), MSG1
- pshufb SHUF_MASK, MSG1
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1rnds4 $0, E1, ABCD
sha1msg1 MSG1, MSG0
/* Rounds 8-11 */
- movu128 2*16(%rdi), MSG2
- pshufb SHUF_MASK, MSG2
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1rnds4 $0, E0, ABCD
@@ -74,8 +76,6 @@
xor128 MSG2, MSG0
/* Rounds 12-15 */
- movu128 3*16(%rdi), MSG3
- pshufb SHUF_MASK, MSG3
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1msg2 MSG3, MSG0
@@ -206,7 +206,7 @@
sha1rnds4 $3, E1, ABCD
/* Add current hash values with previously saved */
- sha1nexte %xmm9, E0
+ sha1nexte %xmm7, E0
paddd %xmm8, ABCD
/* Write hash values back in the correct order */