### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
#ifdef __linux__
	.section .note.GNU-stack, "", @progbits
#endif
	.section .text.sha1_process_block64, "ax", @progbits
	.globl sha1_process_block64
	.hidden sha1_process_block64
	.type sha1_process_block64, @function

	.balign 8 # allow decoders to fetch at least the first 5 insns
sha1_process_block64:
	pushq %rbp # 1 byte insn
	pushq %rbx # 1 byte insn
#	pushq %r15 # 2 byte insn
	pushq %r14 # 2 byte insn
	pushq %r13 # 2 byte insn
	pushq %r12 # 2 byte insn
	pushq %rdi # we need ctx at the end

# Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi,r8..r14: temps
# r15: unused
# xmm0..xmm3: W[]
# xmm4,xmm5: temps
# xmm6: current round constant
# xmm7: all round constants
# -64(%rsp): area for passing RCONST + W[] from vector to integer units
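# Note: -64(%rsp) lies in the x86-64 SysV red zone (the 128 bytes below
# %rsp that a leaf function may use without moving %rsp), so the W[]
# scratch area needs no explicit stack allocation.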

	movl 80(%rdi), %eax # a = ctx->hash[0]
	movl 84(%rdi), %ebx # b = ctx->hash[1]
	movl 88(%rdi), %ecx # c = ctx->hash[2]
	movl 92(%rdi), %edx # d = ctx->hash[3]
	movl 96(%rdi), %ebp # e = ctx->hash[4]

	movaps sha1const(%rip), %xmm7
	pshufd $0x00, %xmm7, %xmm6

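# sha1const appears to hold the four SHA-1 round constants, one per
# dword lane of xmm7; pshufd $0x00 broadcasts lane 0 (K0 = 0x5A827999,
# rounds 0..19) into every lane of xmm6.
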
# Load W[] to xmm0..3, byteswapping on the fly.
#
# For iterations 0..15, we pass W[] in rsi,r8..r14
# for use in RD1As instead of spilling them to stack.
# We lose parallelized addition of RCONST, but LEA
# can do two additions at once, so it is probably a wash.
# (We use rsi instead of rN because this makes the
# LEAs in the first two RD1As one byte shorter.)
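#
# A C sketch of the load/byteswap/pack idiom used below (illustration
# only; load64/bswap64/rotl64 stand in for the obvious operations):
#	uint64_t t = load64(&data[n]);  // two big-endian words, LE-loaded
#	t = bswap64(t);                 // t = W[n]:W[n+1], W[n] in high dword
#	t = rotl64(t, 32);              // t = W[n+1]:W[n], W[n] in low dword
# Two such registers are then packed into one xmm with punpcklqdq,
# giving (W[n],W[n+1],W[n+2],W[n+3]) in lane order.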
	movq 4*0(%rdi), %rsi
	movq 4*2(%rdi), %r8
	bswapq %rsi
	bswapq %r8
	rolq $32, %rsi # rsi = W[1]:W[0]
	rolq $32, %r8 # r8 = W[3]:W[2]
	movq %rsi, %xmm0
	movq %r8, %xmm4
	punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
#	movaps %xmm0, %xmm4 # add RCONST, spill to stack
#	paddd %xmm6, %xmm4
#	movups %xmm4, -64+16*0(%rsp)

	movq 4*4(%rdi), %r9
	movq 4*6(%rdi), %r10
	bswapq %r9
	bswapq %r10
	rolq $32, %r9 # r9 = W[5]:W[4]
	rolq $32, %r10 # r10 = W[7]:W[6]
	movq %r9, %xmm1
	movq %r10, %xmm4
	punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])

	movq 4*8(%rdi), %r11
	movq 4*10(%rdi), %r12
	bswapq %r11
	bswapq %r12
	rolq $32, %r11 # r11 = W[9]:W[8]
	rolq $32, %r12 # r12 = W[11]:W[10]
	movq %r11, %xmm2
	movq %r12, %xmm4
	punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])

	movq 4*12(%rdi), %r13
	movq 4*14(%rdi), %r14
	bswapq %r13
	bswapq %r14
	rolq $32, %r13 # r13 = W[13]:W[12]
	rolq $32, %r14 # r14 = W[15]:W[14]
	movq %r13, %xmm3
	movq %r14, %xmm4
	punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])

# 0
	leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
	shrq $32, %rsi
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
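# A C sketch of one such round (rounds 0..19 use the "choice" function;
# this mirrors the comments above, register names aside):
#	e += 0x5A827999 + W[n] + (((c ^ d) & b) ^ d) + rotl32(a, 5);
#	b = rotl32(b, 30);
# There is no data movement between rounds: the (a,b,c,d,e) roles just
# rotate to the next register, which is why each unrolled round below
# names a different register for the same role.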
# 1
	leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 2
	leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
	shrq $32, %r8
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 3
	leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 4
	leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
	shrq $32, %r9
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 5
	leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 6
	leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
	shrq $32, %r10
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 7
	leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps %xmm3, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm0, %xmm5
	shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm0 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm0, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm0, %xmm0 # shift left by 1
	psubd %xmm4, %xmm0 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm0, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*0(%rsp)
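# The PREP block above computes the next four schedule words
#	W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1),
# adds the current round constant (paddd %xmm6) and spills the sums to
# the scratch area. Two points worth spelling out:
# - SSE2 has no per-dword rotate, so rol-by-1 is synthesized: paddd
#   doubles each dword, and psubd of the pcmpgtd sign mask adds back
#   the bit that fell off the MSB.
# - The last lane needs W[t-3] from the very vector being computed
#   (the 0 left by psrldq stands in for it); the "W[3] fixup" repairs
#   that lane by xoring in rol(unrotated lane 0, 2) == rol(new lane 0, 1).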
# 8
	leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
	shrq $32, %r11
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 9
	leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 10
	leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
	shrq $32, %r12
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 11
	leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
	pshufd $0x55, %xmm7, %xmm6
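# Move to the next round constant: pshufd $0x55 broadcasts dword 1 of
# xmm7 into xmm6 (K1 = 0x6ED9EBA1 in standard SHA-1, used for rounds
# 20..39); the PREP below folds it into W[20..23].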
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps %xmm0, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm1, %xmm5
	shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm1 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm1, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm1, %xmm1 # shift left by 1
	psubd %xmm4, %xmm1 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm1, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*1(%rsp)
# 12
	leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
	shrq $32, %r13
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 13
	leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 14
	leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
	shrq $32, %r14
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 15
	leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps %xmm1, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm2, %xmm5
	shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm2 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm2, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm2, %xmm2 # shift left by 1
	psubd %xmm4, %xmm2 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm2, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*2(%rsp)
# 16
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
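# From round 16 on, RCONST + W[n & 15] comes from the scratch slots
# filled by PREP (the constant was pre-added there by paddd %xmm6),
# so a single addl from the stack replaces the leal of rounds 0..15.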
# 17
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 18
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 19
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps %xmm2, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm3, %xmm5
	shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm3 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm3, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm3, %xmm3 # shift left by 1
	psubd %xmm4, %xmm3 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm3, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*3(%rsp)
# 20
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
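# A C sketch of rounds 20..39 (parity function, per the comments above):
#	e += K1 + W[n & 15] + (c ^ d ^ b) + rotl32(a, 5);
#	b = rotl32(b, 30);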
# 21
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 22
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 23
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps %xmm3, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm0, %xmm5
	shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm0 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm0, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm0, %xmm0 # shift left by 1
	psubd %xmm4, %xmm0 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm0, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*0(%rsp)
# 24
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 25
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 26
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 27
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps %xmm0, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm1, %xmm5
	shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm1 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm1, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm1, %xmm1 # shift left by 1
	psubd %xmm4, %xmm1 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm1, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*1(%rsp)
# 28
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 29
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 30
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 31
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
	pshufd $0xaa, %xmm7, %xmm6
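# Next round constant: pshufd $0xaa broadcasts dword 2 of xmm7 into
# xmm6 (K2 = 0x8F1BBCDC in standard SHA-1, used for rounds 40..59).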
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps %xmm1, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm2, %xmm5
	shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm2 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm2, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm2, %xmm2 # shift left by 1
	psubd %xmm4, %xmm2 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm2, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*2(%rsp)
# 32
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 33
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 34
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 35
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps %xmm2, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm3, %xmm5
	shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm3 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm3, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm3, %xmm3 # shift left by 1
	psubd %xmm4, %xmm3 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm3, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*3(%rsp)
# 36
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 37
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 38
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 39
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps %xmm3, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm0, %xmm5
	shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm0 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm0, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with MSB set)
	paddd %xmm0, %xmm0 # shift left by 1
	psubd %xmm4, %xmm0 # add 1 where MSB was set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
#	xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm0, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*0(%rsp)
# 40
	movl %ebx, %edi # di: b
	movl %ebx, %esi # si: b
	orl %ecx, %edi # di: b | c
	andl %ecx, %esi # si: b & c
	andl %edx, %edi # di: (b | c) & d
	orl %esi, %edi # ((b | c) & d) | (b & c)
	addl %edi, %ebp # += ((b | c) & d) | (b & c)
	addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
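# A C sketch of rounds 40..59 (majority function, per the comments above):
#	e += K2 + W[n & 15] + (((b | c) & d) | (b & c)) + rotl32(a, 5);
#	b = rotl32(b, 30);
# ((b | c) & d) | (b & c) equals the textbook MAJ(b,c,d) =
# (b & c) | (b & d) | (c & d), computed with one operation fewer.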
# 41
	movl %eax, %edi # di: b
	movl %eax, %esi # si: b
	orl %ebx, %edi # di: b | c
	andl %ebx, %esi # si: b & c
	andl %ecx, %edi # di: (b | c) & d
	orl %esi, %edi # ((b | c) & d) | (b & c)
	addl %edi, %edx # += ((b | c) & d) | (b & c)
	addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 42
	movl %ebp, %edi # di: b
	movl %ebp, %esi # si: b
	orl %eax, %edi # di: b | c
	andl %eax, %esi # si: b & c
	andl %ebx, %edi # di: (b | c) & d
	orl %esi, %edi # ((b | c) & d) | (b & c)
	addl %edi, %ecx # += ((b | c) & d) | (b & c)
	addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 43
	movl %edx, %edi # di: b
	movl %edx, %esi # si: b
	orl %ebp, %edi # di: b | c
	andl %ebp, %esi # si: b & c
	andl %eax, %edi # di: (b | c) & d
	orl %esi, %edi # ((b | c) & d) | (b & c)
	addl %edi, %ebx # += ((b | c) & d) | (b & c)
	addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
Denys Vlasenko | 39369ff | 2022-01-23 09:27:30 +0100 | [diff] [blame] | 845 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) |
| 846 | movaps %xmm0, %xmm4 |
| 847 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) |
Denys Vlasenko | c193cbd | 2022-02-07 02:06:18 +0100 | [diff] [blame] | 848 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) |
| 849 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) |
| 850 | # same result as above, but shorter and faster: |
| 851 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, |
| 852 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! |
| 853 | movaps %xmm1, %xmm5 |
| 854 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) |
Denys Vlasenko | 39369ff | 2022-01-23 09:27:30 +0100 | [diff] [blame] | 855 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) |
| 856 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) |
| 857 | xorps %xmm5, %xmm1 # ^ |
| 858 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup |
| 859 | movaps %xmm1, %xmm5 |
| 860 | xorps %xmm4, %xmm4 # rol(W0,1): |
Denys Vlasenko | 205042c | 2022-01-25 17:00:57 +0100 | [diff] [blame] | 861 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) |
| 862 | paddd %xmm1, %xmm1 # shift left by 1 |
| 863 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 |
Denys Vlasenko | 39369ff | 2022-01-23 09:27:30 +0100 | [diff] [blame] | 864 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup |
| 865 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) |
| 866 | movaps %xmm5, %xmm4 |
| 867 | pslld $2, %xmm5 |
| 868 | psrld $30, %xmm4 |
| 869 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) |
| 870 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 |
| 871 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) |
| 872 | movaps %xmm1, %xmm5 |
| 873 | paddd %xmm6, %xmm5 |
| 874 | movups %xmm5, -64+16*1(%rsp) |
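# The PREP blocks compute four message-schedule words per pass, using
# the SHA-1 recurrence
#   W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
# The rol32-by-1 is branchless: pcmpgtd yields all-ones in lanes whose
# sign bit is set, paddd doubles every lane, and psubd of that mask
# adds back the bit carried out of the top.
# Lane 3 needs W[t-3], which is lane 0 of this same quad and is not
# known before the rotate; the quad is therefore computed with 0 in
# that spot (the "W[3] fixup"). After the rol-by-1, the contribution
# still missing from lane 3 is
#   rol32(W[t],1) = rol32(rol32(unrotW[0],1),1) = rol32(unrotW[0],2)
# which is exactly what the pslldq/pslld/psrld/xorps tail XORs in.
# Finally the current round constant (xmm6) is added before the store,
# so each scalar round fetches RCONST + W[n] with a single addl.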
# 44
movl %ecx, %edi # di: b
movl %ecx, %esi # si: b
orl %edx, %edi # di: b | c
andl %edx, %esi # si: b & c
andl %ebp, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %eax # += ((b | c) & d) | (b & c)
addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# 45
movl %ebx, %edi # di: b
movl %ebx, %esi # si: b
orl %ecx, %edi # di: b | c
andl %ecx, %esi # si: b & c
andl %edx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ebp # += ((b | c) & d) | (b & c)
addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# 46
movl %eax, %edi # di: b
movl %eax, %esi # si: b
orl %ebx, %edi # di: b | c
andl %ebx, %esi # si: b & c
andl %ecx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %edx # += ((b | c) & d) | (b & c)
addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
# 47
movl %ebp, %edi # di: b
movl %ebp, %esi # si: b
orl %eax, %edi # di: b | c
andl %eax, %esi # si: b & c
andl %ebx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ecx # += ((b | c) & d) | (b & c)
addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
movaps %xmm1, %xmm4
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps %xmm2, %xmm5
shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps %xmm5, %xmm2 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps %xmm2, %xmm5
xorps %xmm4, %xmm4 # rol(W0,1):
pcmpgtd %xmm2, %xmm4 # ffffffff for negative elements (MSB set)
paddd %xmm2, %xmm2 # shift left by 1
psubd %xmm4, %xmm2 # add 1 where MSB was set (completes rol by 1)
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps %xmm5, %xmm4
pslld $2, %xmm5
psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
movaps %xmm2, %xmm5
paddd %xmm6, %xmm5
movups %xmm5, -64+16*2(%rsp)
# 48
movl %edx, %edi # di: b
movl %edx, %esi # si: b
orl %ebp, %edi # di: b | c
andl %ebp, %esi # si: b & c
andl %eax, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ebx # += ((b | c) & d) | (b & c)
addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# 49
movl %ecx, %edi # di: b
movl %ecx, %esi # si: b
orl %edx, %edi # di: b | c
andl %edx, %esi # si: b & c
andl %ebp, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %eax # += ((b | c) & d) | (b & c)
addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# 50
movl %ebx, %edi # di: b
movl %ebx, %esi # si: b
orl %ecx, %edi # di: b | c
andl %ecx, %esi # si: b & c
andl %edx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ebp # += ((b | c) & d) | (b & c)
addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# 51
movl %eax, %edi # di: b
movl %eax, %esi # si: b
orl %ebx, %edi # di: b | c
andl %ebx, %esi # si: b & c
andl %ecx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %edx # += ((b | c) & d) | (b & c)
addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
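# Broadcast dword 3 of the constants vector, 0xCA62C1D6, into xmm6:
# the W[] quad stored by the next PREP lands in stack slots 12..15,
# which are read by rounds 60..63, already part of the last
# 20-round group.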
pshufd $0xff, %xmm7, %xmm6
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
movaps %xmm2, %xmm4
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps %xmm3, %xmm5
shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps %xmm5, %xmm3 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps %xmm3, %xmm5
xorps %xmm4, %xmm4 # rol(W0,1):
pcmpgtd %xmm3, %xmm4 # ffffffff for negative elements (MSB set)
paddd %xmm3, %xmm3 # shift left by 1
psubd %xmm4, %xmm3 # add 1 where MSB was set (completes rol by 1)
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps %xmm5, %xmm4
pslld $2, %xmm5
psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
movaps %xmm3, %xmm5
paddd %xmm6, %xmm5
movups %xmm5, -64+16*3(%rsp)
# 52
movl %ebp, %edi # di: b
movl %ebp, %esi # si: b
orl %eax, %edi # di: b | c
andl %eax, %esi # si: b & c
andl %ebx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ecx # += ((b | c) & d) | (b & c)
addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# 53
movl %edx, %edi # di: b
movl %edx, %esi # si: b
orl %ebp, %edi # di: b | c
andl %ebp, %esi # si: b & c
andl %eax, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ebx # += ((b | c) & d) | (b & c)
addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# 54
movl %ecx, %edi # di: b
movl %ecx, %esi # si: b
orl %edx, %edi # di: b | c
andl %edx, %esi # si: b & c
andl %ebp, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %eax # += ((b | c) & d) | (b & c)
addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# 55
movl %ebx, %edi # di: b
movl %ebx, %esi # si: b
orl %ecx, %edi # di: b | c
andl %ecx, %esi # si: b & c
andl %edx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ebp # += ((b | c) & d) | (b & c)
addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
movaps %xmm3, %xmm4
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps %xmm0, %xmm5
shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps %xmm5, %xmm0 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps %xmm0, %xmm5
xorps %xmm4, %xmm4 # rol(W0,1):
pcmpgtd %xmm0, %xmm4 # ffffffff for negative elements (MSB set)
paddd %xmm0, %xmm0 # shift left by 1
psubd %xmm4, %xmm0 # add 1 where MSB was set (completes rol by 1)
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps %xmm5, %xmm4
pslld $2, %xmm5
psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
movaps %xmm0, %xmm5
paddd %xmm6, %xmm5
movups %xmm5, -64+16*0(%rsp)
# 56
movl %eax, %edi # di: b
movl %eax, %esi # si: b
orl %ebx, %edi # di: b | c
andl %ebx, %esi # si: b & c
andl %ecx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %edx # += ((b | c) & d) | (b & c)
addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
# 57
movl %ebp, %edi # di: b
movl %ebp, %esi # si: b
orl %eax, %edi # di: b | c
andl %eax, %esi # si: b & c
andl %ebx, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ecx # += ((b | c) & d) | (b & c)
addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# 58
movl %edx, %edi # di: b
movl %edx, %esi # si: b
orl %ebp, %edi # di: b | c
andl %ebp, %esi # si: b & c
andl %eax, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %ebx # += ((b | c) & d) | (b & c)
addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# 59
movl %ecx, %edi # di: b
movl %ecx, %esi # si: b
orl %edx, %edi # di: b | c
andl %edx, %esi # si: b & c
andl %ebp, %edi # di: (b | c) & d
orl %esi, %edi # ((b | c) & d) | (b & c)
addl %edi, %eax # += ((b | c) & d) | (b & c)
addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
movaps %xmm0, %xmm4
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps %xmm1, %xmm5
shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps %xmm5, %xmm1 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps %xmm1, %xmm5
xorps %xmm4, %xmm4 # rol(W0,1):
pcmpgtd %xmm1, %xmm4 # ffffffff for negative elements (MSB set)
paddd %xmm1, %xmm1 # shift left by 1
psubd %xmm4, %xmm1 # add 1 where MSB was set (completes rol by 1)
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps %xmm5, %xmm4
pslld $2, %xmm5
psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
movaps %xmm1, %xmm5
paddd %xmm6, %xmm5
movups %xmm5, -64+16*1(%rsp)
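# Rounds 60..79: F is the parity function again, b ^ c ^ d (the same
# form as rounds 20..39), with RCONST = 0xCA62C1D6 already folded
# into the stacked W[] values. C sketch of one round:
#   e += (b ^ c ^ d) + RCONST + W[n & 15] + rotl32(a, 5);
#   b = rotl32(b, 30);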
# 60
movl %ecx, %edi # c
xorl %edx, %edi # ^d
xorl %ebx, %edi # ^b
addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
addl %edi, %ebp # e += (c ^ d ^ b)
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# 61
movl %ebx, %edi # c
xorl %ecx, %edi # ^d
xorl %eax, %edi # ^b
addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
addl %edi, %edx # e += (c ^ d ^ b)
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
# 62
movl %eax, %edi # c
xorl %ebx, %edi # ^d
xorl %ebp, %edi # ^b
addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
addl %edi, %ecx # e += (c ^ d ^ b)
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# 63
movl %ebp, %edi # c
xorl %eax, %edi # ^d
xorl %edx, %edi # ^b
addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
addl %edi, %ebx # e += (c ^ d ^ b)
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
movaps %xmm1, %xmm4
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps %xmm2, %xmm5
shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps %xmm5, %xmm2 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps %xmm2, %xmm5
xorps %xmm4, %xmm4 # rol(W0,1):
pcmpgtd %xmm2, %xmm4 # ffffffff for negative elements (MSB set)
paddd %xmm2, %xmm2 # shift left by 1
psubd %xmm4, %xmm2 # add 1 where MSB was set (completes rol by 1)
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps %xmm5, %xmm4
pslld $2, %xmm5
psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
movaps %xmm2, %xmm5
paddd %xmm6, %xmm5
movups %xmm5, -64+16*2(%rsp)
# 64
movl %edx, %edi # c
xorl %ebp, %edi # ^d
xorl %ecx, %edi # ^b
addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
addl %edi, %eax # e += (c ^ d ^ b)
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# 65
movl %ecx, %edi # c
xorl %edx, %edi # ^d
xorl %ebx, %edi # ^b
addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
addl %edi, %ebp # e += (c ^ d ^ b)
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# 66
movl %ebx, %edi # c
xorl %ecx, %edi # ^d
xorl %eax, %edi # ^b
addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
addl %edi, %edx # e += (c ^ d ^ b)
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
# 67
movl %eax, %edi # c
xorl %ebx, %edi # ^d
xorl %ebp, %edi # ^b
addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
addl %edi, %ecx # e += (c ^ d ^ b)
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
movaps %xmm2, %xmm4
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
movaps %xmm3, %xmm5
shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
xorps %xmm5, %xmm3 # ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
movaps %xmm3, %xmm5
xorps %xmm4, %xmm4 # rol(W0,1):
pcmpgtd %xmm3, %xmm4 # ffffffff for negative elements (MSB set)
paddd %xmm3, %xmm3 # shift left by 1
psubd %xmm4, %xmm3 # add 1 where MSB was set (completes rol by 1)
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
movaps %xmm5, %xmm4
pslld $2, %xmm5
psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
movaps %xmm3, %xmm5
paddd %xmm6, %xmm5
movups %xmm5, -64+16*3(%rsp)
# 68
movl %ebp, %edi # c
xorl %eax, %edi # ^d
xorl %edx, %edi # ^b
addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
addl %edi, %ebx # e += (c ^ d ^ b)
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# 69
movl %edx, %edi # c
xorl %ebp, %edi # ^d
xorl %ecx, %edi # ^b
addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
addl %edi, %eax # e += (c ^ d ^ b)
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# 70
movl %ecx, %edi # c
xorl %edx, %edi # ^d
xorl %ebx, %edi # ^b
addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
addl %edi, %ebp # e += (c ^ d ^ b)
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# 71
movl %ebx, %edi # c
xorl %ecx, %edi # ^d
xorl %eax, %edi # ^b
addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
addl %edi, %edx # e += (c ^ d ^ b)
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
# 72
movl %eax, %edi # c
xorl %ebx, %edi # ^d
xorl %ebp, %edi # ^b
addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
addl %edi, %ecx # e += (c ^ d ^ b)
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# 73
movl %ebp, %edi # c
xorl %eax, %edi # ^d
xorl %edx, %edi # ^b
addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
addl %edi, %ebx # e += (c ^ d ^ b)
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# 74
movl %edx, %edi # c
xorl %ebp, %edi # ^d
xorl %ecx, %edi # ^b
addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
addl %edi, %eax # e += (c ^ d ^ b)
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)
# 75
movl %ecx, %edi # c
xorl %edx, %edi # ^d
xorl %ebx, %edi # ^b
addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
addl %edi, %ebp # e += (c ^ d ^ b)
movl %eax, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebp # e += rotl32(a,5)
rorl $2, %ebx # b = rotl32(b,30)
# 76
movl %ebx, %edi # c
xorl %ecx, %edi # ^d
xorl %eax, %edi # ^b
addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
addl %edi, %edx # e += (c ^ d ^ b)
movl %ebp, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %edx # e += rotl32(a,5)
rorl $2, %eax # b = rotl32(b,30)
# 77
movl %eax, %edi # c
xorl %ebx, %edi # ^d
xorl %ebp, %edi # ^b
addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
addl %edi, %ecx # e += (c ^ d ^ b)
movl %edx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ecx # e += rotl32(a,5)
rorl $2, %ebp # b = rotl32(b,30)
# 78
movl %ebp, %edi # c
xorl %eax, %edi # ^d
xorl %edx, %edi # ^b
addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
addl %edi, %ebx # e += (c ^ d ^ b)
movl %ecx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %ebx # e += rotl32(a,5)
rorl $2, %edx # b = rotl32(b,30)
# 79
movl %edx, %edi # c
xorl %ebp, %edi # ^d
xorl %ecx, %edi # ^b
addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
addl %edi, %eax # e += (c ^ d ^ b)
movl %ebx, %esi #
roll $5, %esi # rotl32(a,5)
addl %esi, %eax # e += rotl32(a,5)
rorl $2, %ecx # b = rotl32(b,30)

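# Epilogue: restore ctx (pushed last in the prologue) and the
# callee-saved registers, folding the working variables a..e back
# into ctx->hash[]. Pops and adds are interleaved, presumably to
# give the CPU independent work to schedule between the stack loads.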
popq %rdi #
popq %r12 #
addl %eax, 80(%rdi) # ctx->hash[0] += a
popq %r13 #
addl %ebx, 84(%rdi) # ctx->hash[1] += b
popq %r14 #
addl %ecx, 88(%rdi) # ctx->hash[2] += c
# popq %r15 #
addl %edx, 92(%rdi) # ctx->hash[3] += d
popq %rbx #
addl %ebp, 96(%rdi) # ctx->hash[4] += e
popq %rbp #

ret
.size sha1_process_block64, .-sha1_process_block64

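# The four round constants, one per 20-round group; pshufd broadcasts
# the needed dword into xmm6. The "aM",...,16 flags mark the section
# allocatable and mergeable with 16-byte entities, so the linker can
# share identical constant blocks across objects.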
.section .rodata.cst16.sha1const, "aM", @progbits, 16
.balign 16
sha1const:
.long 0x5A827999
.long 0x6ED9EBA1
.long 0x8F1BBCDC
.long 0xCA62C1D6

#endif