#!/bin/sh

# We don't regenerate it on every "make" invocation - only by hand.
# The reason is that the changes to generated code are difficult
# to visualize by looking only at this script, it helps when the commit
# also contains the diff of the generated file.
exec >hash_md5_sha_x86-64.S

# Based on http://arctic.org/~dean/crypto/sha1.html.
# ("This SHA1 implementation is public domain.")
#
# x86-64 has at least SSE2 vector insns always available.
# We can use them without any CPUID checks (and without a need
# for a fallback code if needed insns are not available).
# This code uses them to calculate W[] ahead of time.
#
# Unfortunately, results are passed from vector unit to
# integer ALUs on the stack. MOVD/Q insns to move them directly
# from vector to integer registers are slower than store-to-load
# forwarding in LSU (on Skylake at least).
#
# The win against a purely integer code is small on Skylake,
# only about 7-8%. We offload about 1/3 of our operations to the vector unit.
# It can do 4 ops at once in one 128-bit register,
# but we have to use x2 of them because of W[0] complication,
# SSE2 has no "rotate each word by N bits" insns,
# moving data to/from vector unit is clunky, and Skylake
# has four integer ALUs unified with three vector ALUs,
# which makes pure integer code rather fast, and makes
# vector ops compete with integer ones.
#
# Zen3, with its separate vector ALUs, wins more, about 12%.

# Scratch vector registers and the register holding the current
# round constant (see "Register and stack use" in the generated code).
xmmT1="%xmm4"
xmmT2="%xmm5"
xmmRCONST="%xmm6"
# A literal TAB: INTERLEAVE uses it to recognize how generated lines start.
T=$(printf '\t')

# SSE instructions are longer than 4 bytes on average.
# Intel CPUs (up to Tiger Lake at least) can't decode
# more than 16 bytes of code in one cycle.
# By interleaving SSE code and integer code
# we mostly achieve a situation where 16-byte decode fetch window
# contains 4 (or more) insns.
#
# However. On Skylake, there was no observed difference,
# but on Zen3, non-interleaved code is ~3% faster
# (822 Mb/s versus 795 Mb/s hashing speed).
# Off for now:
interleave=false
| 51 | |
# INTERLEAVE TEXT1 TEXT2
#
# Print two multi-line chunks of generated code.
#
# When $interleave is not "true", TEXT1 is printed followed by TEXT2.
#
# Otherwise lines are emitted alternately, one from TEXT1 then one
# from TEXT2, with these exceptions:
#  - empty lines are dropped from both inputs;
#  - comment lines (starting with "#" or TAB "#") are passed through
#    immediately and do not count as a turn;
#  - a TEXT2 line starting with TAB "lea" is printed at once together
#    with the line following it, and TEXT2's turn continues.
# Once one input is exhausted, an empty line is printed in its place
# on every remaining turn.
#
# Uses globals: interleave, T (a literal TAB).
# Scratch files "$0.temp1" and "$0.temp2" are created next to the script
# and removed when the subshell exits.
INTERLEAVE() {
	if ! $interleave; then
		# Generate non-interleaved code
		# (it should work correctly too)
		printf '%s\n' "$1"
		printf '%s\n' "$2"
		return
	fi
	(
	# Remove the scratch files however the subshell terminates
	trap 'rm -f "$0.temp1" "$0.temp2"' EXIT
	printf '%s\n' "$1" | grep -v '^$' >"$0.temp1"
	printf '%s\n' "$2" | grep -v '^$' >"$0.temp2"
	exec 3<"$0.temp1"
	exec 4<"$0.temp2"
	while :; do
		line1=''
		line2=''
		while :; do
			# At EOF read leaves line1 empty, which ends this loop
			IFS= read -r line1 <&3
			case "$line1" in
			"#"*|"$T#"*)
				printf '%s\n' "$line1"
				;;
			*)
				break
				;;
			esac
		done
		while :; do
			IFS= read -r line2 <&4
			case "$line2" in
			"${T}lea"*)
				# We use 7-8 byte long forms of LEA.
				# Do not interleave them with SSE insns
				# which are also long.
				printf '%s\n' "$line2"
				IFS= read -r line2 <&4
				printf '%s\n' "$line2"
				;;
			"#"*|"$T#"*)
				printf '%s\n' "$line2"
				;;
			*)
				break
				;;
			esac
		done
		# Both inputs exhausted?
		test "$line1$line2" || break
		printf '%s\n' "$line1"
		printf '%s\n' "$line2"
	done
	)
}
Denys Vlasenko | 1433568 | 2022-01-08 22:43:24 +0100 | [diff] [blame] | 100 | |
# Emit the fixed head of the generated file: the preprocessor guard,
# section/symbol directives, register saves, loading of the five hash
# words from ctx, and loading + byteswapping of the 16 input words into
# rsi,r8..r14 and xmm0..xmm3.
echo \
"### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
	.section	.text.sha1_process_block64, \"ax\", @progbits
	.globl	sha1_process_block64
	.hidden	sha1_process_block64
	.type	sha1_process_block64, @function

	.balign	8	# allow decoders to fetch at least 5 first insns
sha1_process_block64:
	pushq	%rbp	# 1 byte insn
	pushq	%rbx	# 1 byte insn
#	pushq	%r15	# 2 byte insn
	pushq	%r14	# 2 byte insn
	pushq	%r13	# 2 byte insn
	pushq	%r12	# 2 byte insn
	pushq	%rdi	# we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi,r8..r14: temps
# r15: unused
# xmm0..xmm3: W[]
# xmm4,xmm5: temps
# xmm6: current round constant
# -64(%rsp): area for passing RCONST + W[] from vector to integer units

	movl	80(%rdi), %eax		# a = ctx->hash[0]
	movl	84(%rdi), %ebx		# b = ctx->hash[1]
	movl	88(%rdi), %ecx		# c = ctx->hash[2]
	movl	92(%rdi), %edx		# d = ctx->hash[3]
	movl	96(%rdi), %ebp		# e = ctx->hash[4]

	movaps	rconst0x5A827999(%rip), $xmmRCONST

	# Load W[] to xmm registers, byteswapping on the fly.
	#
	# For iterations 0..15, we pass W[] in rsi,r8..r14
	# for use in RD1A's instead of spilling them to stack.
	# We lose parallelized addition of RCONST, but LEA
	# can do two additions at once, so it's probably a wash.
	# (We use rsi instead of rN because this makes two
	# LEAs in two first RD1A's shorter by one byte).
	movq	4*0(%rdi), %rsi
	movq	4*2(%rdi), %r8
	bswapq	%rsi
	bswapq	%r8
	rolq	\$32, %rsi		# rsi = W[1]:W[0]
	rolq	\$32, %r8		# r8  = W[3]:W[2]
	movq	%rsi, %xmm0
	movq	%r8, $xmmT1
	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
#	paddd	$xmmRCONST, $xmmT1
#	movups	$xmmT1, -64+16*0(%rsp)

	movq	4*4(%rdi), %r9
	movq	4*6(%rdi), %r10
	bswapq	%r9
	bswapq	%r10
	rolq	\$32, %r9		# r9  = W[5]:W[4]
	rolq	\$32, %r10		# r10 = W[7]:W[6]
	movq	%r9, %xmm1
	movq	%r10, $xmmT1
	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])

	movq	4*8(%rdi), %r11
	movq	4*10(%rdi), %r12
	bswapq	%r11
	bswapq	%r12
	rolq	\$32, %r11		# r11 = W[9]:W[8]
	rolq	\$32, %r12		# r12 = W[11]:W[10]
	movq	%r11, %xmm2
	movq	%r12, $xmmT1
	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])

	movq	4*12(%rdi), %r13
	movq	4*14(%rdi), %r14
	bswapq	%r13
	bswapq	%r14
	rolq	\$32, %r13		# r13 = W[13]:W[12]
	rolq	\$32, %r14		# r14 = W[15]:W[14]
	movq	%r13, %xmm3
	movq	%r14, $xmmT1
	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
"
| 189 | |
# PREP xmmW0 xmmW4 xmmW8 xmmW12 dstmem
#
# Print vector code which computes the next four W[] words in place
# in register xmmW0, then stores them with the current round constant
# added to memory operand dstmem.
#
# $1..$4  the registers currently holding W[0..3], W[4..7], W[8..11]
#         and W[12..15]; must be %xmm0..3 in some permutation
# $5      destination memory operand, e.g. "-64+16*0(%rsp)"
#
# Uses globals: xmmT1, xmmT2 (scratch registers, clobbered by the
# generated code) and xmmRCONST (register with the round constant).
PREP() {
local xmmW0="$1"
local xmmW4="$2"
local xmmW8="$3"
local xmmW12="$4"
# the above must be %xmm0..3 in some permutation
local dstmem="$5"
#W[0] = rol(W[13] ^ W[8]  ^ W[2] ^ W[0], 1);
#W[1] = rol(W[14] ^ W[9]  ^ W[3] ^ W[1], 1);
#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1);
#W[3] = rol(  0   ^ W[11] ^ W[5] ^ W[3], 1);
#W[3] ^= rol(W[0], 1);
# "$*" (not "$@"): the arguments are joined into this single string
echo "# PREP $*
	movaps	$xmmW12, $xmmT1
	psrldq	\$4, $xmmT1	# rshift by 4 bytes: T1 = ([13],[14],[15],0)

#	pshufd	\$0x4e, $xmmW0, $xmmT2	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq $xmmW4, $xmmT2	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	$xmmW0, $xmmT2
	shufps	\$0x4e, $xmmW4, $xmmT2	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])

	xorps	$xmmW8, $xmmW0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	$xmmT1, $xmmT2	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	$xmmT2, $xmmW0	# ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	$xmmW0, $xmmT2

	xorps	$xmmT1, $xmmT1	# rol(W0,1):
	pcmpgtd	$xmmW0, $xmmT1	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	$xmmW0, $xmmW0	#  shift left by 1
	psubd	$xmmT1, $xmmW0	#  add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup

	pslldq	\$12, $xmmT2	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	$xmmT2, $xmmT1
	pslld	\$2, $xmmT2
	psrld	\$30, $xmmT1
#	xorps	$xmmT1, $xmmT2	# rol((0,0,0,unrotW[0]),2)
	xorps	$xmmT1, $xmmW0	# same result, but does not depend on/does not modify T2

	xorps	$xmmT2, $xmmW0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
"
#	movq	$xmmW0, %r8	# high latency (~6 cycles)
#	movaps	$xmmW0, $xmmT1
#	psrldq	\$8, $xmmT1	# rshift by 8 bytes: move upper 64 bits to lower
#	movq	$xmmT1, %r10	# high latency
#	movq	%r8, %r9
#	movq	%r10, %r11
#	shrq	\$32, %r9
#	shrq	\$32, %r11
# ^^^ slower than passing the results on stack (!!!)
echo "
	movaps	$xmmW0, $xmmT2
	paddd	$xmmRCONST, $xmmT2
	movups	$xmmT2, $dstmem
"
}
| 250 | |
# It's possible to interleave integer insns in rounds to mostly eliminate
# dependency chains, but this is likely to only help old Pentium-based
# CPUs (ones without OOO, which can only simultaneously execute a pair
# of _adjacent_ insns).
# Testing on an old-ish Silvermont CPU (which has an OOO window of only
# ~8 insns) shows a very small (~1%) speedup.
| 257 | |
# RD1A a b c d e n
#
# Print integer code for one round-1 step where W[n] is taken from
# a general-purpose register instead of the stack.
#
# $1..$5  register name stems ("ax", "bx", "cx", "dx", "bp") playing
#         the roles of a, b, c, d, e in this step
# $6      step number; only its low 4 bits select the W[] register
#
# W[0],W[1] are taken from rsi; for n0 >= 2, W[n0] is taken from
# r(7+n0/2), i.e. r8..r14. Even steps use the low 32 bits and then
# shift the register right by 32 so the following odd step finds its
# word in the low half.
# Uses global: RCONST (round constant as an assembler literal).
RD1A() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
local n0=$(((n+0) & 15))
# Name stem of the 64-bit register holding W[n0]
local rW=si
test $n0 -ge 2 && rW=$((7+n0/2))
echo "
# $n
	leal	$RCONST(%r$e,%r$rW), %e$e # e += RCONST + W[n]"
test $((n0 & 1)) = 0 && echo "	shrq	\$32, %r$rW"
echo "	movl	%e$c, %edi		# c
	xorl	%e$d, %edi		# ^d
	andl	%e$b, %edi		# &b
	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
	movl	%e$a, %edi		#
	roll	\$5, %edi		# rotl32(a,5)
	addl	%edi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)
"
}
# RD1B a b c d e n
#
# Print integer code for one round-1 step where RCONST + W[n & 15]
# is read from the stack area at -64(%rsp).
#
# $1..$5  register name stems playing the roles of a, b, c, d, e
# $6      step number; its low 4 bits select the stack slot
RD1B() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
local n0=$(((n+0) & 15))
echo "
# $n
	movl	%e$c, %edi		# c
	xorl	%e$d, %edi		# ^d
	andl	%e$b, %edi		# &b
	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
	addl	-64+4*$n0(%rsp), %e$e	# e += RCONST + W[n & 15]
	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)
"
}
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 308 | |
# RD2 a b c d e n
#
# Print integer code for one step using the (c ^ d ^ b) function,
# with RCONST + W[n & 15] read from the stack area at -64(%rsp).
# Used for both round 2 and round 4.
#
# $1..$5  register name stems playing the roles of a, b, c, d, e
# $6      step number; its low 4 bits select the stack slot
RD2() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
local n0=$(((n+0) & 15))
echo "
# $n
	movl	%e$c, %edi		# c
	xorl	%e$d, %edi		# ^d
	xorl	%e$b, %edi		# ^b
	addl	-64+4*$n0(%rsp), %e$e	# e += RCONST + W[n & 15]
	addl	%edi, %e$e		# e += (c ^ d ^ b)
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)
"
}
Denys Vlasenko | 947bef0 | 2022-01-03 13:00:07 +0100 | [diff] [blame] | 329 | |
# RD3 a b c d e n
#
# Print integer code for one round-3 step using the
# ((b | c) & d) | (b & c) function, with RCONST + W[n & 15]
# read from the stack area at -64(%rsp).
#
# $1..$5  register name stems playing the roles of a, b, c, d, e
# $6      step number; its low 4 bits select the stack slot
RD3() {
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
local n=$(($6))
local n0=$(((n+0) & 15))
echo "
# $n
	movl	%e$b, %edi		# di: b
	movl	%e$b, %esi		# si: b
	orl	%e$c, %edi		# di: b | c
	andl	%e$c, %esi		# si: b & c
	andl	%e$d, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %e$e		# += ((b | c) & d) | (b & c)
	addl	-64+4*$n0(%rsp), %e$e	# e += RCONST + W[n & 15]
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)
"
}
Denys Vlasenko | 39369ff | 2022-01-23 09:27:30 +0100 | [diff] [blame] | 353 | |
# Emit all 80 steps.
#
# Steps 0..15 take W[] from integer registers (RD1A). From step 8 on,
# every group of four steps is paired with a PREP which computes the
# four W[] words needed eight steps later and stores them (with the
# round constant already added) into one of four 16-byte stack slots,
# used cyclically. The xmm round constant is therefore switched before
# the PREP whose results are the first to be consumed by the next round.
# The last eight steps have no PREP: nothing is left to precompute.
# Empty lines are stripped from everything generated here.
{
# Round 1
RCONST=0x5A827999
RD1A ax bx cx dx bp  0; RD1A bp ax bx cx dx  1; RD1A dx bp ax bx cx  2; RD1A cx dx bp ax bx  3;
RD1A bx cx dx bp ax  4; RD1A ax bx cx dx bp  5; RD1A bp ax bx cx dx  6; RD1A dx bp ax bx cx  7;
a=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
b=$(RD1A cx dx bp ax bx  8; RD1A bx cx dx bp ax  9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;)
INTERLEAVE "$a" "$b"
a=$(echo "	movaps	rconst0x6ED9EBA1(%rip), $xmmRCONST"
	PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
b=$(RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
b=$(RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;)
INTERLEAVE "$a" "$b"

# Round 2
RCONST=0x6ED9EBA1
a=$(PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
b=$(RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
b=$(RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
b=$(RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;)
INTERLEAVE "$a" "$b"
a=$(echo "	movaps	rconst0x8F1BBCDC(%rip), $xmmRCONST"
	PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
b=$(RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
b=$(RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;)
INTERLEAVE "$a" "$b"

# Round 3
RCONST=0x8F1BBCDC
a=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
b=$(RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
b=$(RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
b=$(RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;)
INTERLEAVE "$a" "$b"
a=$(echo "	movaps	rconst0xCA62C1D6(%rip), $xmmRCONST"
	PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
b=$(RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)")
b=$(RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;)
INTERLEAVE "$a" "$b"

# Round 4 has the same logic as round 2, only n and RCONST are different
RCONST=0xCA62C1D6
a=$(PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)")
b=$(RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)")
b=$(RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;)
INTERLEAVE "$a" "$b"
a=$(PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)")
b=$(RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;)
INTERLEAVE "$a" "$b"
RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75;
RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79;
} | grep -v '^$'
| 422 | |
# Emit the fixed tail of the generated file: restore the saved registers
# (in reverse order of the pushes) while adding a..e back into
# ctx->hash[0..4], return, then the four 16-byte round-constant tables
# and the closing preprocessor guard.
echo "
	popq	%rdi		#
	popq	%r12		#
	addl	%eax, 80(%rdi)	# ctx->hash[0] += a
	popq	%r13		#
	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
	popq	%r14		#
	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
#	popq	%r15		#
	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
	popq	%rbx		#
	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e
	popq	%rbp		#

	ret
	.size	sha1_process_block64, .-sha1_process_block64

	.section	.rodata.cst16.sha1const, \"aM\", @progbits, 16
	.balign	16
rconst0x5A827999:
	.long	0x5A827999
	.long	0x5A827999
	.long	0x5A827999
	.long	0x5A827999
rconst0x6ED9EBA1:
	.long	0x6ED9EBA1
	.long	0x6ED9EBA1
	.long	0x6ED9EBA1
	.long	0x6ED9EBA1
rconst0x8F1BBCDC:
	.long	0x8F1BBCDC
	.long	0x8F1BBCDC
	.long	0x8F1BBCDC
	.long	0x8F1BBCDC
rconst0xCA62C1D6:
	.long	0xCA62C1D6
	.long	0xCA62C1D6
	.long	0xCA62C1D6
	.long	0xCA62C1D6

#endif"