#!/bin/sh

# The output file is regenerated by hand only, not on every "make" invocation.
# The reason is that the changes to generated code are difficult
# to visualize by looking only at this script; it helps when the commit
# also contains the diff of the generated file.
# Everything printed from here on goes into the generated file.
exec 1>hash_md5_sha_x86-64.S

# Based on http://arctic.org/~dean/crypto/sha1.html.
# ("This SHA1 implementation is public domain.")
#
# x86-64 has at least SSE2 vector insns always available.
# We can use them without any CPUID checks (and without a need
# for a fallback code if needed insns are not available).
# This code uses them to calculate W[] ahead of time.
#
# Unfortunately, results are passed from vector unit to
# integer ALUs on the stack. MOVD/Q insns to move them directly
# from vector to integer registers are slower than store-to-load
# forwarding in LSU (on Skylake at least).
#
# The win against a purely integer code is small on Skylake,
# only about 7-8%. We offload about 1/3 of our operations to the vector unit.
# It can do 4 ops at once in one 128-bit register,
# but we have to use x2 of them because of W[0] complication,
# SSE2 has no "rotate each word by N bits" insns,
# moving data to/from vector unit is clunky, and Skylake
# has four integer ALUs unified with three vector ALUs,
# which makes pure integer code rather fast, and makes
# vector ops compete with integer ones.
#
# Zen3, with its separate vector ALUs, wins more, about 12%.

# Names of the vector registers substituted into the generated code.
xmmT1='%xmm4'
xmmT2='%xmm5'
xmmRCONST='%xmm6'
xmmALLRCONST='%xmm7'
# A literal TAB character, used when matching line prefixes.
T=$(printf '\t')

# SSE instructions are longer than 4 bytes on average.
# Intel CPUs (up to Tiger Lake at least) can't decode
# more than 16 bytes of code in one cycle.
# By interleaving SSE code and integer code
# we mostly achieve a situation where 16-byte decode fetch window
# contains 4 (or more) insns.
#
# However. On Skylake, there was no observed difference,
# but on Zen3, non-interleaved code is ~3% faster
# (822 Mb/s versus 795 Mb/s hashing speed).
# Off for now:
interleave=false
| 52 | |
# INTERLEAVE text1 text2
#
# Print two multi-line code fragments to stdout.
#   $1 - first fragment (the callers pass the vector code from PREP)
#   $2 - second fragment (the callers pass the integer round code)
#
# When the global $interleave is "false", the fragments are printed
# one after the other, unchanged.
# When it is "true", non-empty lines are emitted alternately, one from
# each fragment, with these exceptions:
#   - comment lines (starting with "#" or TAB "#") are passed through
#     immediately and do not count as a line of the pair;
#   - in the second fragment, a line starting with TAB "lea" is printed
#     together with the line following it, before pairing resumes.
# Scratch files "$0.temp1" and "$0.temp2" are used and removed afterwards.
#
# Line prefixes are matched with "case" patterns: the substring form
# ${var:off:len} is not POSIX and fails on strict /bin/sh implementations.
INTERLEAVE() {
	$interleave || {
		# Generate non-interleaved code
		# (it should work correctly too)
		echo "$1"
		echo "$2"
		return
	}
	(
	echo "$1" | grep -v '^$' >"$0.temp1"
	echo "$2" | grep -v '^$' >"$0.temp2"
	exec 3<"$0.temp1"
	exec 4<"$0.temp2"
	IFS=''
	while :; do
		line1=''
		line2=''
		# Next non-comment line of the first fragment
		# (empty at EOF: a failed read leaves the variable empty).
		while :; do
			read -r line1 <&3
			case "$line1" in
			"#"* | "$T#"*)
				echo "$line1"
				;;
			*)
				break
				;;
			esac
		done
		# Next non-comment, non-LEA line of the second fragment.
		while :; do
			read -r line2 <&4
			case "$line2" in
			"${T}lea"*)
				# We use 7-8 byte long forms of LEA.
				# Do not interleave them with SSE insns
				# which are also long.
				echo "$line2"
				read -r line2 <&4
				echo "$line2"
				continue
				;;
			"#"* | "$T#"*)
				echo "$line2"
				;;
			*)
				break
				;;
			esac
		done
		# Both inputs exhausted?
		test "$line1$line2" || break
		echo "$line1"
		echo "$line2"
	done
	rm "$0.temp1" "$0.temp2"
	)
}
Denys Vlasenko | 1433568 | 2022-01-08 22:43:24 +0100 | [diff] [blame] | 101 | |
Denys Vlasenko | 1f272c0 | 2022-02-11 23:03:27 +0100 | [diff] [blame] | 102 | # movaps bswap32_mask(%rip), $xmmT1 |
Denys Vlasenko | dda77e8 | 2022-02-11 14:53:26 +0100 | [diff] [blame] | 103 | # Load W[] to xmm0..3, byteswapping on the fly. |
| 104 | # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 |
| 105 | # for use in RD1As instead of spilling them to stack. |
| 106 | # (We use rsi instead of rN because this makes two |
| 107 | # ADDs in two first RD1As shorter by one byte). |
| 108 | # movups 16*0(%rdi), %xmm0 |
| 109 | # pshufb $xmmT1, %xmm0 #SSSE3 insn |
| 110 | # movaps %xmm0, $xmmT2 |
| 111 | # paddd $xmmRCONST, $xmmT2 |
| 112 | # movq $xmmT2, %rsi |
Denys Vlasenko | 1f272c0 | 2022-02-11 23:03:27 +0100 | [diff] [blame] | 113 | # #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn |
| 114 | # #movhpd $xmmT2, %r8 #can only move to mem, not to reg |
Denys Vlasenko | dda77e8 | 2022-02-11 14:53:26 +0100 | [diff] [blame] | 115 | # shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence |
| 116 | # movq $xmmT2, %r8 # instead |
| 117 | # ... |
| 118 | # <repeat for xmm1,2,3> |
| 119 | # ... |
| 120 | #- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
| 121 | #+ addl %esi, %e$e # e += RCONST + W[n] |
| 122 | # ^^^^^^^^^^^^^^^^^^^^^^^^ |
| 123 | # The above is -97 bytes of code... |
| 124 | # ...but pshufb is a SSSE3 insn. Can't use it. |
| 125 | |
# Emit the fixed head of the generated file: preprocessor guard, section
# and symbol directives, register saves, loading of the five hash words
# and of the round constants, and loading of the byteswapped W[0..15]
# into both rsi,r8..r14 and xmm0..xmm3.
# (\$ yields a literal dollar sign; the xmm* shell variables are expanded.)
cat <<EOF
### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
#ifdef __linux__
.section .note.GNU-stack, "", @progbits
#endif
.section .text.sha1_process_block64, "ax", @progbits
.globl sha1_process_block64
.hidden sha1_process_block64
.type sha1_process_block64, @function

.balign 8 # allow decoders to fetch at least 5 first insns
sha1_process_block64:
pushq %rbp # 1 byte insn
pushq %rbx # 1 byte insn
# pushq %r15 # 2 byte insn
pushq %r14 # 2 byte insn
pushq %r13 # 2 byte insn
pushq %r12 # 2 byte insn
pushq %rdi # we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi,r8..r14: temps
# r15: unused
# xmm0..xmm3: W[]
# xmm4,xmm5: temps
# xmm6: current round constant
# xmm7: all round constants
# -64(%rsp): area for passing RCONST + W[] from vector to integer units

movl 80(%rdi), %eax # a = ctx->hash[0]
movl 84(%rdi), %ebx # b = ctx->hash[1]
movl 88(%rdi), %ecx # c = ctx->hash[2]
movl 92(%rdi), %edx # d = ctx->hash[3]
movl 96(%rdi), %ebp # e = ctx->hash[4]

movaps sha1const(%rip), $xmmALLRCONST
pshufd \$0x00, $xmmALLRCONST, $xmmRCONST

# Load W[] to xmm0..3, byteswapping on the fly.
#
# For iterations 0..15, we pass W[] in rsi,r8..r14
# for use in RD1As instead of spilling them to stack.
# We lose parallelized addition of RCONST, but LEA
# can do two additions at once, so it is probably a wash.
# (We use rsi instead of rN because this makes two
# LEAs in two first RD1As shorter by one byte).
movq 4*0(%rdi), %rsi
movq 4*2(%rdi), %r8
bswapq %rsi
bswapq %r8
rolq \$32, %rsi # rsi = W[1]:W[0]
rolq \$32, %r8 # r8 = W[3]:W[2]
movq %rsi, %xmm0
movq %r8, $xmmT1
punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
# movaps %xmm0, $xmmT1 # add RCONST, spill to stack
# paddd $xmmRCONST, $xmmT1
# movups $xmmT1, -64+16*0(%rsp)

movq 4*4(%rdi), %r9
movq 4*6(%rdi), %r10
bswapq %r9
bswapq %r10
rolq \$32, %r9 # r9 = W[5]:W[4]
rolq \$32, %r10 # r10 = W[7]:W[6]
movq %r9, %xmm1
movq %r10, $xmmT1
punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])

movq 4*8(%rdi), %r11
movq 4*10(%rdi), %r12
bswapq %r11
bswapq %r12
rolq \$32, %r11 # r11 = W[9]:W[8]
rolq \$32, %r12 # r12 = W[11]:W[10]
movq %r11, %xmm2
movq %r12, $xmmT1
punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])

movq 4*12(%rdi), %r13
movq 4*14(%rdi), %r14
bswapq %r13
bswapq %r14
rolq \$32, %r13 # r13 = W[13]:W[12]
rolq \$32, %r14 # r14 = W[15]:W[14]
movq %r13, %xmm3
movq %r14, $xmmT1
punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])

EOF
| 219 | |
# PREP xmmW0 xmmW4 xmmW8 xmmW12 dstmem
#
# Print the vector code which computes the next four W[] words in place
# in xmmW0, then stores them, with the current round constant added,
# to dstmem.
#   $1..$4 - registers holding W[0..3], W[4..7], W[8..11], W[12..15];
#            must be %xmm0..3 in some permutation
#   $5     - memory operand that receives RCONST + W[0..3]
# Reads the globals xmmT1, xmmT2 (scratch registers) and xmmRCONST.
PREP() {
	local xmmW0=$1
	local xmmW4=$2
	local xmmW8=$3
	local xmmW12=$4
	# the above must be %xmm0..3 in some permutation
	local dstmem=$5
	#W[0] = rol(W[13] ^ W[8]  ^ W[2] ^ W[0], 1);
	#W[1] = rol(W[14] ^ W[9]  ^ W[3] ^ W[1], 1);
	#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1);
	#W[3] = rol(  0   ^ W[11] ^ W[5] ^ W[3], 1);
	#W[3] ^= rol(W[0], 1);
	# "$*" (not "$@") is used below: the arguments are meant to be
	# joined into the text of a single comment line.
	echo "# PREP $*
movaps $xmmW12, $xmmT1
psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)

# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps $xmmW0, $xmmT2
shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])

xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps $xmmT2, $xmmW0 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps $xmmW0, $xmmT2

xorps $xmmT1, $xmmT1 # rol(W0,1):
pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1)
paddd $xmmW0, $xmmW0 # shift left by 1
psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup

pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps $xmmT2, $xmmT1
pslld \$2, $xmmT2
psrld \$30, $xmmT1
# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2)
xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2

xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
"
	# movq $xmmW0, %r8 # high latency (~6 cycles)
	# movaps $xmmW0, $xmmT1
	# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower
	# movq $xmmT1, %r10 # high latency
	# movq %r8, %r9
	# movq %r10, %r11
	# shrq \$32, %r9
	# shrq \$32, %r11
	# ^^^ slower than passing the results on stack (!!!)
	echo "
movaps $xmmW0, $xmmT2
paddd $xmmRCONST, $xmmT2
movups $xmmT2, $dstmem
"
}
| 280 | |
# It's possible to interleave integer insns in rounds to mostly eliminate
# dependency chains, but this is likely to help only old Pentium-based
# CPUs (ones without OOO, which can only simultaneously execute a pair
# of _adjacent_ insns).
# Testing on an old-ish Silvermont CPU (which has an OOO window of only
# about 8 insns) shows a very small (~1%) speedup.
| 287 | |
# RD1A a b c d e n
#
# Print the code of one round-1 step for steps 0..15, where W[n] is
# still held in an integer register.
#   $1..$5 - register name suffixes (e.g. ax, bp) playing the roles a..e
#   $6     - step number
# The W[] pair for steps 0,1 is in %rsi, for steps 2k,2k+1 in %r(7+k).
# On an even step the low half is consumed and the register is then
# shifted right by 32 so that the odd step finds its word in the low half.
# Reads the global RCONST.
RD1A() {
	local a=$1 b=$2 c=$3 d=$4 e=$5
	local n=$(($6))
	local n0=$(((n+0) & 15))
	local rN=$((7+n0/2))
	local wreg
	if test $n0 -lt 2; then
		wreg="%rsi"
	else
		wreg="%r$rN"
	fi
	echo "
# $n
"
	if test $((n0 & 1)) = 0; then
		echo "
leal $RCONST(%r$e,$wreg), %e$e # e += RCONST + W[n]
shrq \$32, $wreg
"
	else
		echo "
leal $RCONST(%r$e,$wreg), %e$e # e += RCONST + W[n]
"
	fi
	echo "
movl %e$c, %edi # c
xorl %e$d, %edi # ^d
andl %e$b, %edi # &b
xorl %e$d, %edi # (((c ^ d) & b) ^ d)
addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
movl %e$a, %edi #
roll \$5, %edi # rotl32(a,5)
addl %edi, %e$e # e += rotl32(a,5)
rorl \$2, %e$b # b = rotl32(b,30)
"
}
# RD1B a b c d e n
#
# Print the code of one round-1 step for steps 16..19, where
# RCONST + W[n & 15] is read from the stack area at -64(%rsp).
#   $1..$5 - register name suffixes (e.g. ax, bp) playing the roles a..e
#   $6     - step number
RD1B() {
	local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
	local n=$(($6))
	# Index of the stack slot holding RCONST + W[n]
	local n0=$(((n+0) & 15))
	echo "
# $n
movl %e$c, %edi # c
xorl %e$d, %edi # ^d
andl %e$b, %edi # &b
xorl %e$d, %edi # (((c ^ d) & b) ^ d)
addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
movl %e$a, %esi #
roll \$5, %esi # rotl32(a,5)
addl %esi, %e$e # e += rotl32(a,5)
rorl \$2, %e$b # b = rotl32(b,30)
"
}
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 338 | |
# RD2 a b c d e n
#
# Print the code of one step using the parity function (c ^ d ^ b);
# the script uses it for steps 20..39 and 60..79.
# RCONST + W[n & 15] is read from the stack area at -64(%rsp).
#   $1..$5 - register name suffixes (e.g. ax, bp) playing the roles a..e
#   $6     - step number
RD2() {
	local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
	local n=$(($6))
	# Index of the stack slot holding RCONST + W[n]
	local n0=$(((n+0) & 15))
	echo "
# $n
movl %e$c, %edi # c
xorl %e$d, %edi # ^d
xorl %e$b, %edi # ^b
addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
addl %edi, %e$e # e += (c ^ d ^ b)
movl %e$a, %esi #
roll \$5, %esi # rotl32(a,5)
addl %esi, %e$e # e += rotl32(a,5)
rorl \$2, %e$b # b = rotl32(b,30)
"
}
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 359 | |
# RD3 a b c d e n
#
# Print the code of one step using the majority function
# ((b | c) & d) | (b & c); the script uses it for steps 40..59.
# RCONST + W[n & 15] is read from the stack area at -64(%rsp).
#   $1..$5 - register name suffixes (e.g. ax, bp) playing the roles a..e
#   $6     - step number
RD3() {
	local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
	local n=$(($6))
	# Index of the stack slot holding RCONST + W[n]
	local n0=$(((n+0) & 15))
	echo "
# $n
movl %e$b, %edi # di: b
movl %e$b, %esi # si: b
orl %e$c, %edi # di: b | c
andl %e$c, %esi # si: b & c
andl %e$d, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %e$e # += ((b | c) & d) | (b & c)
addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
movl %e$a, %esi #
roll \$5, %esi # rotl32(a,5)
addl %esi, %e$e # e += rotl32(a,5)
rorl \$2, %e$b # b = rotl32(b,30)
"
}
Denys Vlasenko | 39369ff | 2022-01-23 09:27:30 +0100 | [diff] [blame] | 383 | |
# Emit all 80 steps. Each PREP computes four more W[] words into one of
# the four 16-byte slots at -64(%rsp) and is paired, via INTERLEAVE,
# with four integer steps. Steps 0..7 and 72..79 have no PREP to pair with.
# Empty lines produced by the helpers are dropped from the output.
{
# Round 1
RCONST=0x5A827999
RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3
RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7
vec=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
alu=$(RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11)
INTERLEAVE "$vec" "$alu"
# Switch the current round constant to the one for round 2:
# the W[] computed from here on are consumed by steps 20 and later.
vec=$(
	echo " pshufd \$0x55, $xmmALLRCONST, $xmmRCONST"
	PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"
)
alu=$(RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
alu=$(RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19)
INTERLEAVE "$vec" "$alu"

# Round 2
RCONST=0x6ED9EBA1
vec=$(PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
alu=$(RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
alu=$(RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
alu=$(RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31)
INTERLEAVE "$vec" "$alu"
# Switch the current round constant to the one for round 3.
vec=$(
	echo " pshufd \$0xaa, $xmmALLRCONST, $xmmRCONST"
	PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"
)
alu=$(RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
alu=$(RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39)
INTERLEAVE "$vec" "$alu"

# Round 3
RCONST=0x8F1BBCDC
vec=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
alu=$(RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
alu=$(RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
alu=$(RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51)
INTERLEAVE "$vec" "$alu"
# Switch the current round constant to the one for round 4.
vec=$(
	echo " pshufd \$0xff, $xmmALLRCONST, $xmmRCONST"
	PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"
)
alu=$(RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
alu=$(RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59)
INTERLEAVE "$vec" "$alu"

# Round 4 has the same logic as round 2, only n and RCONST are different
RCONST=0xCA62C1D6
vec=$(PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
alu=$(RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
alu=$(RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67)
INTERLEAVE "$vec" "$alu"
vec=$(PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
alu=$(RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71)
INTERLEAVE "$vec" "$alu"
RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75
RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79
} | grep -v '^$'
| 452 | |
# Emit the fixed tail of the generated file: restore the saved registers
# while adding a..e back into ctx->hash[], return, then the table of the
# four round constants and the closing #endif.
# The here-document is quoted: nothing in it is expanded by the shell.
cat <<'EOF'

popq %rdi #
popq %r12 #
addl %eax, 80(%rdi) # ctx->hash[0] += a
popq %r13 #
addl %ebx, 84(%rdi) # ctx->hash[1] += b
popq %r14 #
addl %ecx, 88(%rdi) # ctx->hash[2] += c
# popq %r15 #
addl %edx, 92(%rdi) # ctx->hash[3] += d
popq %rbx #
addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbp #

ret
.size sha1_process_block64, .-sha1_process_block64

.section .rodata.cst16.sha1const, "aM", @progbits, 16
.balign 16
sha1const:
.long 0x5A827999
.long 0x6ED9EBA1
.long 0x8F1BBCDC
.long 0xCA62C1D6

#endif
EOF