#!/bin/sh

# Generator for hash_md5_sha_x86-64.S: everything this script prints on
# stdout is redirected into that file by the "exec" below.
#
# We don't regenerate it on every "make" invocation - only by hand.
# The reason is that the changes to generated code are difficult
# to visualize by looking only at this script, it helps when the commit
# also contains the diff of the generated file.
exec >hash_md5_sha_x86-64.S
| 8 | |
# Emit the fixed head of the generated file: banner, preprocessor guard,
# section/symbol directives, the register pushes, and the code which
# - byte-swaps W[0..7] into -32..-1(%rsp) (the loop at label 1),
# - loads ctx->hash[0..4] into eax,ebx,ecx,edx,ebp,
# - byte-swaps W[8..15] into r8..r15.
# The text is single-quoted: "$3", "$32" etc. must reach the assembler
# verbatim, not be expanded by the shell.
echo \
'### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
.section .text.sha1_process_block64,"ax",@progbits
.globl sha1_process_block64
.hidden sha1_process_block64
.type sha1_process_block64, @function

.balign 8 # allow decoders to fetch at least 5 first insns
sha1_process_block64:
pushq %rbp # 1 byte insn
pushq %rbx # 1 byte insn
pushq %r15 # 2 byte insn
pushq %r14 # 2 byte insn
pushq %r13 # 2 byte insn
pushq %r12 # 2 byte insn
pushq %rdi # we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi: temps
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
movl $3, %eax
1:
movq (%rdi,%rax,8), %rsi
bswapq %rsi
rolq $32, %rsi
movq %rsi, -32(%rsp,%rax,8)
decl %eax
jns 1b

movl 80(%rdi), %eax # a = ctx->hash[0]
movl 84(%rdi), %ebx # b = ctx->hash[1]
movl 88(%rdi), %ecx # c = ctx->hash[2]
movl 92(%rdi), %edx # d = ctx->hash[3]
movl 96(%rdi), %ebp # e = ctx->hash[4]

movq 4*8(%rdi), %r8
movq 4*10(%rdi), %r10
bswapq %r8
bswapq %r10
movq 4*12(%rdi), %r12
movq 4*14(%rdi), %r14
bswapq %r12
bswapq %r14
movl %r8d, %r9d
shrq $32, %r8
movl %r10d, %r11d
shrq $32, %r10
movl %r12d, %r13d
shrq $32, %r12
movl %r14d, %r15d
shrq $32, %r14
'
# W32 N
# Print the assembler operand which holds the 32-bit word W[N]:
#   N = 0..7   a stack slot, "-32+4*N(%rsp)"
#   N = 8..15  a register,   "%rNd"
# A missing or out-of-range N is fatal (exit 1). Callers run W32 inside a
# command substitution, where "exit" only ends the subshell, so a
# diagnostic is also printed on stderr to make the failure visible.
# Returns 0 for every valid N.
W32() {
	test "$1" || { echo "W32: missing word index" >&2; exit 1; }
	test "$1" -lt 0 && { echo "W32: bad word index '$1'" >&2; exit 1; }
	test "$1" -gt 15 && { echo "W32: bad word index '$1'" >&2; exit 1; }
	if test "$1" -lt 8; then
		echo "-32+4*$1(%rsp)"
	else
		echo "%r${1}d"
	fi
}
| 73 | |
# It's possible to interleave insns in rounds to mostly eliminate
# dependency chains, but this is likely to help only old Pentium-based
# CPUs (ones without OOO, which can only simultaneously execute a pair
# of _adjacent_ insns).
# Testing on an old-ish Silvermont CPU (which has an OOO window of only
# about ~8 insns) shows a very small (~1%) speedup.
| 80 | |
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 81 | RD1A() { |
| 82 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
| 83 | local n=$(($6)) |
Denys Vlasenko | 7abb2bb | 2022-01-03 17:02:48 +0100 | [diff] [blame] | 84 | local n0=$(((n+0) & 15)) |
| 85 | echo " |
| 86 | # $n |
| 87 | ";test $n0 = 0 && echo " |
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 88 | # W[0], already in %esi |
Denys Vlasenko | 7abb2bb | 2022-01-03 17:02:48 +0100 | [diff] [blame] | 89 | ";test $n0 != 0 && test $n0 -lt 8 && echo " |
| 90 | movl `W32 $n0`, %esi # W[n] |
| 91 | ";test $n0 -ge 8 && echo " |
| 92 | # W[n], in %r$n0 |
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 93 | ";echo " |
| 94 | movl %e$c, %edi # c |
| 95 | xorl %e$d, %edi # ^d |
| 96 | andl %e$b, %edi # &b |
| 97 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
Denys Vlasenko | 7abb2bb | 2022-01-03 17:02:48 +0100 | [diff] [blame] | 98 | ";test $n0 -lt 8 && echo " |
| 99 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
| 100 | ";test $n0 -ge 8 && echo " |
| 101 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] |
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 102 | ";echo " |
| 103 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
| 104 | movl %e$a, %esi # |
| 105 | roll \$5, %esi # rotl32(a,5) |
| 106 | addl %esi, %e$e # e += rotl32(a,5) |
| 107 | rorl \$2, %e$b # b = rotl32(b,30) |
| 108 | " |
| 109 | } |
| 110 | RD1B() { |
| 111 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
| 112 | local n=$(($6)) |
| 113 | local n13=$(((n+13) & 15)) |
| 114 | local n8=$(((n+8) & 15)) |
| 115 | local n2=$(((n+2) & 15)) |
| 116 | local n0=$(((n+0) & 15)) |
| 117 | echo " |
| 118 | # $n |
| 119 | ";test $n0 -lt 8 && echo " |
| 120 | movl `W32 $n13`, %esi # W[(n+13) & 15] |
| 121 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] |
| 122 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] |
| 123 | xorl `W32 $n0`, %esi # ^W[n & 15] |
| 124 | roll %esi # |
| 125 | movl %esi, `W32 $n0` # store to W[n & 15] |
| 126 | ";test $n0 -ge 8 && echo " |
| 127 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] |
| 128 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] |
| 129 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] |
| 130 | roll `W32 $n0` # |
Denys Vlasenko | 7abb2bb | 2022-01-03 17:02:48 +0100 | [diff] [blame] | 131 | ";echo " |
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 132 | movl %e$c, %edi # c |
| 133 | xorl %e$d, %edi # ^d |
| 134 | andl %e$b, %edi # &b |
| 135 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
| 136 | ";test $n0 -lt 8 && echo " |
| 137 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] |
| 138 | ";test $n0 -ge 8 && echo " |
| 139 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] |
| 140 | ";echo " |
| 141 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
| 142 | movl %e$a, %esi # |
| 143 | roll \$5, %esi # rotl32(a,5) |
| 144 | addl %esi, %e$e # e += rotl32(a,5) |
| 145 | rorl \$2, %e$b # b = rotl32(b,30) |
| 146 | " |
| 147 | } |
# Rounds 0..19. The register roles rotate by one position per round
# (the previous round's e becomes the next round's a), so the argument
# pattern repeats every 5 rounds. Rounds 0..15 use W[] as loaded (RD1A),
# rounds 16..19 also update the message schedule (RD1B).
# grep drops the empty lines produced by the newline-padded echo strings.
{
RCONST=0x5A827999
RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4
RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9
RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14
RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19
} | grep -v '^$'
| 155 | |
# RD2 A B C D E N
# Emit one round with the parity function (c ^ d ^ b), preceded by the
# message schedule update
#   W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15]
#                      ^ W[(n+2) & 15] ^ W[n & 15], 1)
# The script calls it for N = 20..39 and, with a different RCONST,
# for N = 60..79.
#   A..E  register name suffixes (ax, bx, cx, dx, bp) playing a..e
#   N     round number
# Reads the global RCONST; uses W32 to name the W[] operands.
# If W[n & 15] is a stack slot (index < 8) the new word is built in %esi
# and stored back; if it is a register (index >= 8) it is updated in place.
# Emitted round: e += RCONST + W[n & 15] + (c ^ d ^ b) + rotl32(a,5),
# then b = rotl32(b,30).
# NOTE: each echo string begins and ends with a newline; the resulting
# empty lines are removed by the caller's "grep -v '^$'". The lines which
# start with '";' must therefore stay at column 0.
RD2() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
local n13=$(((n+13) & 15))
local n8=$(((n+8) & 15))
local n2=$(((n+2) & 15))
local n0=$(((n+0) & 15))
echo "
# $n
";test $n0 -lt 8 && echo "
movl $(W32 $n13), %esi # W[(n+13) & 15]
xorl $(W32 $n8), %esi # ^W[(n+8) & 15]
xorl $(W32 $n2), %esi # ^W[(n+2) & 15]
xorl $(W32 $n0), %esi # ^W[n & 15]
roll %esi #
movl %esi, $(W32 $n0) # store to W[n & 15]
";test $n0 -ge 8 && echo "
xorl $(W32 $n13), $(W32 $n0) # W[n & 15] ^= W[(n+13) & 15]
xorl $(W32 $n8), $(W32 $n0) # ^W[(n+8) & 15]
xorl $(W32 $n2), $(W32 $n0) # ^W[(n+2) & 15]
roll $(W32 $n0) #
";echo "
movl %e$c, %edi # c
xorl %e$d, %edi # ^d
xorl %e$b, %edi # ^b
";test $n0 -lt 8 && echo "
leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
";test $n0 -ge 8 && echo "
leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
";echo "
addl %edi, %e$e # e += (c ^ d ^ b)
movl %e$a, %esi #
roll \$5, %esi # rotl32(a,5)
addl %esi, %e$e # e += rotl32(a,5)
rorl \$2, %e$b # b = rotl32(b,30)
"
}
# Rounds 20..39: parity rounds (RD2) with the second round constant.
# Register roles rotate by one position per round, as in rounds 0..19.
# grep drops the empty lines produced by the newline-padded echo strings.
{
RCONST=0x6ED9EBA1
RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24
RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29
RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34
RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39
} | grep -v '^$'
| 200 | |
# RD3 A B C D E N
# Emit one round with the majority function ((b | c) & d) | (b & c),
# with the message schedule update
#   W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15]
#                      ^ W[(n+2) & 15] ^ W[n & 15], 1)
# The script calls it for N = 40..59.
#   A..E  register name suffixes (ax, bx, cx, dx, bp) playing a..e
#   N     round number
# Reads the global RCONST; uses W32 to name the W[] operands.
# Unlike RD1B/RD2, the function value is computed first (into %edi, with
# %esi as scratch), because %esi is then reused for the new W word when
# W[n & 15] is a stack slot (index < 8).
# Emitted round: e += ((b | c) & d) | (b & c), e += RCONST + W[n & 15],
# e += rotl32(a,5), then b = rotl32(b,30).
# NOTE: each echo string begins and ends with a newline; the resulting
# empty lines are removed by the caller's "grep -v '^$'". The lines which
# start with '";' must therefore stay at column 0.
RD3() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
local n13=$(((n+13) & 15))
local n8=$(((n+8) & 15))
local n2=$(((n+2) & 15))
local n0=$(((n+0) & 15))
echo "
# $n
movl %e$b, %edi # di: b
movl %e$b, %esi # si: b
orl %e$c, %edi # di: b | c
andl %e$c, %esi # si: b & c
andl %e$d, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
";test $n0 -lt 8 && echo "
movl $(W32 $n13), %esi # W[(n+13) & 15]
xorl $(W32 $n8), %esi # ^W[(n+8) & 15]
xorl $(W32 $n2), %esi # ^W[(n+2) & 15]
xorl $(W32 $n0), %esi # ^W[n & 15]
roll %esi #
movl %esi, $(W32 $n0) # store to W[n & 15]
";test $n0 -ge 8 && echo "
xorl $(W32 $n13), $(W32 $n0) # W[n & 15] ^= W[(n+13) & 15]
xorl $(W32 $n8), $(W32 $n0) # ^W[(n+8) & 15]
xorl $(W32 $n2), $(W32 $n0) # ^W[(n+2) & 15]
roll $(W32 $n0) #
";echo "
addl %edi, %e$e # += ((b | c) & d) | (b & c)
";test $n0 -lt 8 && echo "
leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
";test $n0 -ge 8 && echo "
leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
";echo "
movl %e$a, %esi #
roll \$5, %esi # rotl32(a,5)
addl %esi, %e$e # e += rotl32(a,5)
rorl \$2, %e$b # b = rotl32(b,30)
"
}
# Rounds 40..59: majority rounds (RD3) with the third round constant.
# The constant is written as a negative number because it is pasted as a
# leal displacement, which has to fit a signed 32-bit value:
# 0x8F1BBCDC - 0x100000000 = -0x70E44324.
# grep drops the empty lines produced by the newline-padded echo strings.
{
#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement"
RCONST=-0x70E44324
RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44
RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49
RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54
RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59
} | grep -v '^$'
| 249 | |
# Round 4 has the same logic as round 2, only n and RCONST are different
# The constant is written as a negative number because it is pasted as a
# leal displacement, which has to fit a signed 32-bit value:
# 0xCA62C1D6 - 0x100000000 = -0x359D3E2A.
# grep drops the empty lines produced by the newline-padded echo strings.
{
#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement"
RCONST=-0x359D3E2A
RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64
RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69
RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74
RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79
# Note: new W[n&15] values generated in last 3 iterations
# (W[13,14,15]) are unused after each of these iterations.
# Since we use r8..r15 for W[8..15], this does not matter.
# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15]
# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed.
} | grep -v '^$'
| 264 | |
# Emit the function tail: pop the saved ctx pointer into %rdi, add a..e
# into ctx->hash[0..4], and restore the pushed registers in reverse push
# order (the pops are interleaved with the adds). %ebp (e) is added to
# ctx->hash[4] before %rbp is popped, %ebx (b) before %rbx is popped.
# Then "ret", the symbol size directive, and the #endif closing the guard
# opened at the top of the generated file.
echo "
popq %rdi #
popq %r12 #
addl %eax, 80(%rdi) # ctx->hash[0] += a
popq %r13 #
addl %ebx, 84(%rdi) # ctx->hash[1] += b
popq %r14 #
addl %ecx, 88(%rdi) # ctx->hash[2] += c
popq %r15 #
addl %edx, 92(%rdi) # ctx->hash[3] += d
popq %rbx #
addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbp #

ret
.size sha1_process_block64, .-sha1_process_block64
#endif"