### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
	.section	.text.sha1_process_block64, "ax", @progbits
	.globl	sha1_process_block64
	.hidden	sha1_process_block64
	.type	sha1_process_block64, @function

	.balign	8	# allow decoders to fetch at least the first 5 insns
sha1_process_block64:
	pushq	%rbp	# 1 byte insn
	pushq	%rbx	# 1 byte insn
#	pushq	%r15	# 2 byte insn
	pushq	%r14	# 2 byte insn
	pushq	%r13	# 2 byte insn
	pushq	%r12	# 2 byte insn
	pushq	%rdi	# we need ctx at the end

# Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi,r8..r14: temps
# r15: unused
# xmm0..xmm3: W[]
# xmm4,xmm5: temps
# xmm6: current round constant
# xmm7: all round constants
# -64(%rsp): area for passing RCONST + W[] from vector to integer units

	movl	80(%rdi), %eax		# a = ctx->hash[0]
	movl	84(%rdi), %ebx		# b = ctx->hash[1]
	movl	88(%rdi), %ecx		# c = ctx->hash[2]
	movl	92(%rdi), %edx		# d = ctx->hash[3]
	movl	96(%rdi), %ebp		# e = ctx->hash[4]

	movaps	sha1const(%rip), %xmm7
	pshufd	$0x00, %xmm7, %xmm6
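# The pshufd above broadcasts one dword of xmm7 into all four lanes of xmm6.
# A minimal C sketch of this setup, assuming sha1const holds the four
# standard SHA-1 round constants as consecutive 32-bit words (the pshufd
# immediates $0x00/$0x55/$0xaa used here and below then select constants
# 0, 1 and 2):
#	#include <stdint.h>
#	static const uint32_t sha1const[4] = {
#		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
#	};
#	uint32_t rconst4[4];	/* xmm6 */
#	for (int i = 0; i < 4; i++)
#		rconst4[i] = sha1const[0];	/* pshufd $0x00, %xmm7, %xmm6 */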

# Load W[] to xmm0..3, byteswapping on the fly.
#
# For iterations 0..15, we pass W[] in rsi,r8..r14
# for use in RD1As instead of spilling them to stack.
# We lose parallelized addition of RCONST, but LEA
# can do two additions at once, so it is probably a wash.
# (We use rsi instead of rN because this makes the two
# LEAs in the first two RD1As shorter by one byte).
	movq	4*0(%rdi), %rsi
	movq	4*2(%rdi), %r8
	bswapq	%rsi
	bswapq	%r8
	rolq	$32, %rsi		# rsi = W[1]:W[0]
	rolq	$32, %r8		# r8  = W[3]:W[2]
	movq	%rsi, %xmm0
	movq	%r8, %xmm4
	punpcklqdq %xmm4, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
#	movaps	%xmm0, %xmm4	# add RCONST, spill to stack
#	paddd	%xmm6, %xmm4
#	movups	%xmm4, -64+16*0(%rsp)

	movq	4*4(%rdi), %r9
	movq	4*6(%rdi), %r10
	bswapq	%r9
	bswapq	%r10
	rolq	$32, %r9		# r9  = W[5]:W[4]
	rolq	$32, %r10		# r10 = W[7]:W[6]
	movq	%r9, %xmm1
	movq	%r10, %xmm4
	punpcklqdq %xmm4, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])

	movq	4*8(%rdi), %r11
	movq	4*10(%rdi), %r12
	bswapq	%r11
	bswapq	%r12
	rolq	$32, %r11		# r11 = W[9]:W[8]
	rolq	$32, %r12		# r12 = W[11]:W[10]
	movq	%r11, %xmm2
	movq	%r12, %xmm4
	punpcklqdq %xmm4, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])

	movq	4*12(%rdi), %r13
	movq	4*14(%rdi), %r14
	bswapq	%r13
	bswapq	%r14
	rolq	$32, %r13		# r13 = W[13]:W[12]
	rolq	$32, %r14		# r14 = W[15]:W[14]
	movq	%r13, %xmm3
	movq	%r14, %xmm4
	punpcklqdq %xmm4, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
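# The byteswapping load above is equivalent to this C sketch (a minimal
# illustration, assuming a little-endian host; load_w is a hypothetical
# helper, and `in` is the 64-byte big-endian block that %rdi points at):
#	#include <stdint.h>
#	#include <string.h>
#	void load_w(const uint8_t *in, uint64_t w2[8])
#	{
#		for (int i = 0; i < 8; i++) {	/* w2[i] = W[2i+1]:W[2i] */
#			uint64_t v;
#			memcpy(&v, in + 8*i, 8);
#			v = __builtin_bswap64(v);	/* bswapq */
#			w2[i] = (v >> 32) | (v << 32);	/* rolq $32 */
#		}
#	}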
| 89 | |
Denys Vlasenko | 05fd13e | 2022-01-03 01:57:29 +0100 | [diff] [blame] | 90 | # 0 |
Denys Vlasenko | dda77e8 | 2022-02-11 14:53:26 +0100 | [diff] [blame] | 91 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] |
Denys Vlasenko | 205042c | 2022-01-25 17:00:57 +0100 | [diff] [blame] | 92 | shrq $32, %rsi |
Denys Vlasenko | 05fd13e | 2022-01-03 01:57:29 +0100 | [diff] [blame] | 93 | movl %ecx, %edi # c |
| 94 | xorl %edx, %edi # ^d |
| 95 | andl %ebx, %edi # &b |
| 96 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
Denys Vlasenko | 05fd13e | 2022-01-03 01:57:29 +0100 | [diff] [blame] | 97 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
Denys Vlasenko | 205042c | 2022-01-25 17:00:57 +0100 | [diff] [blame] | 98 | movl %eax, %edi # |
| 99 | roll $5, %edi # rotl32(a,5) |
| 100 | addl %edi, %ebp # e += rotl32(a,5) |
Denys Vlasenko | 05fd13e | 2022-01-03 01:57:29 +0100 | [diff] [blame] | 101 | rorl $2, %ebx # b = rotl32(b,30) |
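# One such round in C (sketch only; rotl32 is a hypothetical helper, and
# (((c ^ d) & b) ^ d) is the branch-free form of the classic choice
# function F1(b,c,d) = (b & c) | (~b & d)):
#	#include <stdint.h>
#	static uint32_t rotl32(uint32_t x, unsigned n)
#	{
#		return (x << n) | (x >> (32 - n));
#	}
#	/* rounds 0..15, w = W[n]: */
#	e += 0x5A827999 + w + (((c ^ d) & b) ^ d) + rotl32(a, 5);
#	b = rotl32(b, 30);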
# 1
	leal	0x5A827999(%rdx,%rsi), %edx	# e += RCONST + W[n]
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	andl	%eax, %edi		# &b
	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
	movl	%ebp, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 2
	leal	0x5A827999(%rcx,%r8), %ecx	# e += RCONST + W[n]
	shrq	$32, %r8
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	andl	%ebp, %edi		# &b
	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
	movl	%edx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 3
	leal	0x5A827999(%rbx,%r8), %ebx	# e += RCONST + W[n]
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	andl	%edx, %edi		# &b
	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
	movl	%ecx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 4
	leal	0x5A827999(%rax,%r9), %eax	# e += RCONST + W[n]
	shrq	$32, %r9
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	andl	%ecx, %edi		# &b
	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
	movl	%ebx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 5
	leal	0x5A827999(%rbp,%r9), %ebp	# e += RCONST + W[n]
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	andl	%ebx, %edi		# &b
	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
	movl	%eax, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 6
	leal	0x5A827999(%rdx,%r10), %edx	# e += RCONST + W[n]
	shrq	$32, %r10
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	andl	%eax, %edi		# &b
	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
	movl	%ebp, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 7
	leal	0x5A827999(%rcx,%r10), %ecx	# e += RCONST + W[n]
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	andl	%ebp, %edi		# &b
	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
	movl	%edx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm0, %xmm0	# shift left by 1
	psubd	%xmm4, %xmm0	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*0(%rsp)
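# What each PREP block computes, four schedule words at a time:
#   W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
# Lane 3 needs W[t-3] from this very batch, so it is first computed with 0
# in that position and fixed up afterwards (rol distributes over xor, hence
# the extra rol(unrotW[0],2) term). A minimal SSE2-intrinsics sketch of the
# same dataflow, assuming w0..w3 hold (W[t-16..t-13])..(W[t-4..t-1]) with
# lane 0 = lowest index:
#	#include <emmintrin.h>
#	static __m128i rol1(__m128i x)	/* SSE2 has no vector rotate */
#	{
#		__m128i m = _mm_cmpgt_epi32(_mm_setzero_si128(), x); /* pcmpgtd */
#		x = _mm_add_epi32(x, x);		/* paddd: x << 1 */
#		return _mm_sub_epi32(x, m);	/* psubd: +1 where MSB was set */
#	}
#	static __m128i prep(__m128i w0, __m128i w1, __m128i w2, __m128i w3)
#	{
#		__m128i t1 = _mm_srli_si128(w3, 4);		/* psrldq */
#		__m128i t2 = _mm_castps_si128(_mm_shuffle_ps(	/* shufps */
#			_mm_castsi128_ps(w0), _mm_castsi128_ps(w1), 0x4e));
#		__m128i x = _mm_xor_si128(_mm_xor_si128(w0, w2),
#					  _mm_xor_si128(t1, t2));
#		__m128i r = rol1(x);
#		__m128i f = _mm_slli_si128(x, 12);	/* (0,0,0,unrotW[0]) */
#		f = _mm_xor_si128(_mm_slli_epi32(f, 2), _mm_srli_epi32(f, 30));
#		return _mm_xor_si128(r, f);		/* lane 3 fixup */
#	}
# The paddd %xmm6 / movups pair above then stores prep(...) + RCONST into
# the stack slot that the integer rounds read from.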
# 8
	leal	0x5A827999(%rbx,%r11), %ebx	# e += RCONST + W[n]
	shrq	$32, %r11
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	andl	%edx, %edi		# &b
	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
	movl	%ecx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 9
	leal	0x5A827999(%rax,%r11), %eax	# e += RCONST + W[n]
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	andl	%ecx, %edi		# &b
	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
	movl	%ebx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 10
	leal	0x5A827999(%rbp,%r12), %ebp	# e += RCONST + W[n]
	shrq	$32, %r12
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	andl	%ebx, %edi		# &b
	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
	movl	%eax, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 11
	leal	0x5A827999(%rdx,%r12), %edx	# e += RCONST + W[n]
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	andl	%eax, %edi		# &b
	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
	movl	%ebp, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
	pshufd	$0x55, %xmm7, %xmm6
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm1, %xmm1	# shift left by 1
	psubd	%xmm4, %xmm1	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*1(%rsp)
# 12
	leal	0x5A827999(%rcx,%r13), %ecx	# e += RCONST + W[n]
	shrq	$32, %r13
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	andl	%ebp, %edi		# &b
	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
	movl	%edx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 13
	leal	0x5A827999(%rbx,%r13), %ebx	# e += RCONST + W[n]
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	andl	%edx, %edi		# &b
	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
	movl	%ecx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 14
	leal	0x5A827999(%rax,%r14), %eax	# e += RCONST + W[n]
	shrq	$32, %r14
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	andl	%ecx, %edi		# &b
	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
	movl	%ebx, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 15
	leal	0x5A827999(%rbp,%r14), %ebp	# e += RCONST + W[n]
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	andl	%ebx, %edi		# &b
	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
	movl	%eax, %edi		#
	roll	$5, %edi		# rotl32(a,5)
	addl	%edi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm2, %xmm2	# shift left by 1
	psubd	%xmm4, %xmm2	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*2(%rsp)
# 16
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	andl	%eax, %edi		# &b
	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
	addl	-64+4*0(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
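# From round 16 on, W[] lives in the 64 bytes at -64(%rsp), used as a
# circular buffer of 16 dwords that the PREP blocks keep refilling.
# C sketch (wrk is a hypothetical name for that area; RCONST was already
# folded in by PREP's paddd %xmm6):
#	uint32_t wrk[16];		/* -64(%rsp) */
#	e += wrk[n & 15];		/* addl -64+4*(n & 15)(%rsp), e */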
# 17
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	andl	%ebp, %edi		# &b
	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
	addl	-64+4*1(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 18
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	andl	%edx, %edi		# &b
	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
	addl	-64+4*2(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 19
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	andl	%ecx, %edi		# &b
	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
	addl	-64+4*3(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm3, %xmm3	# shift left by 1
	psubd	%xmm4, %xmm3	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*3(%rsp)
# 20
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*4(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
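# Rounds 20..39 (and 60..79) use the parity function F2 = b ^ c ^ d.
# C sketch of one such round, with rotl32 and wrk as in the sketches above
# (for rounds 20..39 the constant folded into wrk[] is presumably
# sha1const[1] = 0x6ED9EBA1, selected by the pshufd $0x55 earlier):
#	e += wrk[n & 15] + (b ^ c ^ d) + rotl32(a, 5);
#	b = rotl32(b, 30);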
# 21
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*5(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 22
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*6(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 23
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*7(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm0, %xmm0	# shift left by 1
	psubd	%xmm4, %xmm0	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*0(%rsp)
# 24
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*8(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 25
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*9(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 26
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*10(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 27
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*11(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm1, %xmm1	# shift left by 1
	psubd	%xmm4, %xmm1	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*1(%rsp)
# 28
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*12(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 29
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*13(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 30
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*14(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 31
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*15(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
	pshufd	$0xaa, %xmm7, %xmm6
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm2, %xmm2	# shift left by 1
	psubd	%xmm4, %xmm2	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*2(%rsp)
# 32
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*0(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 33
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*1(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 34
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*2(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 35
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*3(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm3, %xmm3	# shift left by 1
	psubd	%xmm4, %xmm3	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*3(%rsp)
# 36
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*4(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 37
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*5(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 38
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*6(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 39
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*7(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm0, %xmm0	# shift left by 1
	psubd	%xmm4, %xmm0	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*0(%rsp)
# 40
	movl	%ebx, %edi		# di: b
	movl	%ebx, %esi		# si: b
	orl	%ecx, %edi		# di: b | c
	andl	%ecx, %esi		# si: b & c
	andl	%edx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
	addl	-64+4*8(%rsp), %ebp	# e += RCONST + W[n & 15]
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
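# Rounds 40..59 use the majority function. The form computed above,
# ((b | c) & d) | (b & c), is equivalent to the textbook
# F3(b,c,d) = (b & c) | (b & d) | (c & d) but takes one less operation.
# C sketch, with rotl32 and wrk as in the sketches above:
#	uint32_t maj = ((b | c) & d) | (b & c);
#	e += wrk[n & 15] + maj + rotl32(a, 5);
#	b = rotl32(b, 30);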
# 41
	movl	%eax, %edi		# di: b
	movl	%eax, %esi		# si: b
	orl	%ebx, %edi		# di: b | c
	andl	%ebx, %esi		# si: b & c
	andl	%ecx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
	addl	-64+4*9(%rsp), %edx	# e += RCONST + W[n & 15]
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 42
	movl	%ebp, %edi		# di: b
	movl	%ebp, %esi		# si: b
	orl	%eax, %edi		# di: b | c
	andl	%eax, %esi		# si: b & c
	andl	%ebx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
	addl	-64+4*10(%rsp), %ecx	# e += RCONST + W[n & 15]
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 43
	movl	%edx, %edi		# di: b
	movl	%edx, %esi		# si: b
	orl	%ebp, %edi		# di: b | c
	andl	%ebp, %esi		# si: b & c
	andl	%eax, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
	addl	-64+4*11(%rsp), %ebx	# e += RCONST + W[n & 15]
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from the source operand,
# shufps takes dwords 0,1 from the *2nd* operand and dwords 2,3 from the 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	# 0xffffffff for elements <0 (ones with the MSB set)
	paddd	%xmm1, %xmm1	# shift left by 1
	psubd	%xmm4, %xmm1	# add 1 to those that had the MSB set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*1(%rsp)
# 44
	movl	%ecx, %edi		# di: b
	movl	%ecx, %esi		# si: b
	orl	%edx, %edi		# di: b | c
	andl	%edx, %esi		# si: b & c
	andl	%ebp, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
	addl	-64+4*12(%rsp), %eax	# e += RCONST + W[n & 15]
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 45
	movl	%ebx, %edi		# di: b
	movl	%ebx, %esi		# si: b
	orl	%ecx, %edi		# di: b | c
	andl	%ecx, %esi		# si: b & c
	andl	%edx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
	addl	-64+4*13(%rsp), %ebp	# e += RCONST + W[n & 15]
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 46
	movl	%eax, %edi		# di: b
	movl	%eax, %esi		# si: b
	orl	%ebx, %edi		# di: b | c
	andl	%ebx, %esi		# si: b & c
	andl	%ecx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
	addl	-64+4*14(%rsp), %edx	# e += RCONST + W[n & 15]
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 47
	movl	%ebp, %edi		# di: b
	movl	%ebp, %esi		# si: b
	orl	%eax, %edi		# di: b | c
	andl	%eax, %esi		# si: b & c
	andl	%ebx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
	addl	-64+4*15(%rsp), %ecx	# e += RCONST + W[n & 15]
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	# ffffffff for negative elements (ones with msb set)
	paddd	%xmm2, %xmm2	# shift left by 1
	psubd	%xmm4, %xmm2	# add 1 to elements that had msb set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*2(%rsp)
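# The pcmpgtd/paddd/psubd triple above is an SSE2 idiom for a
# per-lane rotate-left-by-1 (SSE2 has no packed rotate): compare
# against zero to get an all-ones mask in every negative lane,
# double each lane, then subtract the mask, which adds back the
# bit that was shifted out of the top. Per 32-bit lane, roughly:
#	uint32_t mask = ((int32_t)w < 0) ? 0xffffffff : 0;
#	w = (w + w) - mask;	/* == rotl32(w, 1) */
# equivalent to the usual (w << 1) | (w >> 31) spelled as
# movaps+pslld+psrld+por.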
# 48
	movl	%edx, %edi		# di: b
	movl	%edx, %esi		# si: b
	orl	%ebp, %edi		# di: b | c
	andl	%ebp, %esi		# si: b & c
	andl	%eax, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
	addl	-64+4*0(%rsp), %ebx	# e += RCONST + W[n & 15]
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 49
	movl	%ecx, %edi		# di: b
	movl	%ecx, %esi		# si: b
	orl	%edx, %edi		# di: b | c
	andl	%edx, %esi		# si: b & c
	andl	%ebp, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
	addl	-64+4*1(%rsp), %eax	# e += RCONST + W[n & 15]
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 50
	movl	%ebx, %edi		# di: b
	movl	%ebx, %esi		# si: b
	orl	%ecx, %edi		# di: b | c
	andl	%ecx, %esi		# si: b & c
	andl	%edx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
	addl	-64+4*2(%rsp), %ebp	# e += RCONST + W[n & 15]
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 51
	movl	%eax, %edi		# di: b
	movl	%eax, %esi		# si: b
	orl	%ebx, %edi		# di: b | c
	andl	%ebx, %esi		# si: b & c
	andl	%ecx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
	addl	-64+4*3(%rsp), %edx	# e += RCONST + W[n & 15]
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
	pshufd	$0xff, %xmm7, %xmm6
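# pshufd $0xff broadcasts dword 3 of xmm7, i.e. 0xCA62C1D6 from
# sha1const below, into every lane of xmm6. The new constant is
# needed this early because the following PREP already pre-adds it
# into the W[] slots (-64+4*12..15) that are first consumed at
# round 60, where the 0xCA62C1D6 phase begins. Schematically:
#	K = 0xCA62C1D6;		/* rounds 60..79 */
#	stash[i] = W[i] + K;	/* paddd %xmm6 in PREP */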
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	# ffffffff for negative elements (ones with msb set)
	paddd	%xmm3, %xmm3	# shift left by 1
	psubd	%xmm4, %xmm3	# add 1 to elements that had msb set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*3(%rsp)
# 52
	movl	%ebp, %edi		# di: b
	movl	%ebp, %esi		# si: b
	orl	%eax, %edi		# di: b | c
	andl	%eax, %esi		# si: b & c
	andl	%ebx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
	addl	-64+4*4(%rsp), %ecx	# e += RCONST + W[n & 15]
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 53
	movl	%edx, %edi		# di: b
	movl	%edx, %esi		# si: b
	orl	%ebp, %edi		# di: b | c
	andl	%ebp, %esi		# si: b & c
	andl	%eax, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
	addl	-64+4*5(%rsp), %ebx	# e += RCONST + W[n & 15]
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 54
	movl	%ecx, %edi		# di: b
	movl	%ecx, %esi		# si: b
	orl	%edx, %edi		# di: b | c
	andl	%edx, %esi		# si: b & c
	andl	%ebp, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
	addl	-64+4*6(%rsp), %eax	# e += RCONST + W[n & 15]
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 55
	movl	%ebx, %edi		# di: b
	movl	%ebx, %esi		# si: b
	orl	%ecx, %edi		# di: b | c
	andl	%ecx, %esi		# si: b & c
	andl	%edx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
	addl	-64+4*7(%rsp), %ebp	# e += RCONST + W[n & 15]
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	# ffffffff for negative elements (ones with msb set)
	paddd	%xmm0, %xmm0	# shift left by 1
	psubd	%xmm4, %xmm0	# add 1 to elements that had msb set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*0(%rsp)
# 56
	movl	%eax, %edi		# di: b
	movl	%eax, %esi		# si: b
	orl	%ebx, %edi		# di: b | c
	andl	%ebx, %esi		# si: b & c
	andl	%ecx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
	addl	-64+4*8(%rsp), %edx	# e += RCONST + W[n & 15]
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 57
	movl	%ebp, %edi		# di: b
	movl	%ebp, %esi		# si: b
	orl	%eax, %edi		# di: b | c
	andl	%eax, %esi		# si: b & c
	andl	%ebx, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
	addl	-64+4*9(%rsp), %ecx	# e += RCONST + W[n & 15]
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 58
	movl	%edx, %edi		# di: b
	movl	%edx, %esi		# si: b
	orl	%ebp, %edi		# di: b | c
	andl	%ebp, %esi		# si: b & c
	andl	%eax, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
	addl	-64+4*10(%rsp), %ebx	# e += RCONST + W[n & 15]
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 59
	movl	%ecx, %edi		# di: b
	movl	%ecx, %esi		# si: b
	orl	%edx, %edi		# di: b | c
	andl	%edx, %esi		# si: b & c
	andl	%ebp, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)
	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
	addl	-64+4*11(%rsp), %eax	# e += RCONST + W[n & 15]
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	# ffffffff for negative elements (ones with msb set)
	paddd	%xmm1, %xmm1	# shift left by 1
	psubd	%xmm4, %xmm1	# add 1 to elements that had msb set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*1(%rsp)
# 60
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*12(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
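# Rounds 60..79 switch to the parity function
#	F(b,c,d) = b ^ c ^ d
# which is cheaper than the MAJ form above: two XORs into a single
# temporary. One round in C (sketch, names illustrative):
#	e += (b ^ c ^ d) + rotl32(a, 5) + RCONST + W[n & 15];
#	b = rotl32(b, 30);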
# 61
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*13(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 62
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*14(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 63
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*15(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	# ffffffff for negative elements (ones with msb set)
	paddd	%xmm2, %xmm2	# shift left by 1
	psubd	%xmm4, %xmm2	# add 1 to elements that had msb set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*2(%rsp)
# 64
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*0(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 65
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*1(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 66
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*2(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 67
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*3(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	# ffffffff for negative elements (ones with msb set)
	paddd	%xmm3, %xmm3	# shift left by 1
	psubd	%xmm4, %xmm3	# add 1 to elements that had msb set
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*3(%rsp)
# 68
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*4(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 69
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*5(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 70
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*6(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 71
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*7(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 72
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*8(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 73
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*9(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 74
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*10(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)
# 75
	movl	%ecx, %edi		# c
	xorl	%edx, %edi		# ^d
	xorl	%ebx, %edi		# ^b
	addl	-64+4*11(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp		# e += (c ^ d ^ b)
	movl	%eax, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebp		# e += rotl32(a,5)
	rorl	$2, %ebx		# b = rotl32(b,30)
# 76
	movl	%ebx, %edi		# c
	xorl	%ecx, %edi		# ^d
	xorl	%eax, %edi		# ^b
	addl	-64+4*12(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx		# e += (c ^ d ^ b)
	movl	%ebp, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %edx		# e += rotl32(a,5)
	rorl	$2, %eax		# b = rotl32(b,30)
# 77
	movl	%eax, %edi		# c
	xorl	%ebx, %edi		# ^d
	xorl	%ebp, %edi		# ^b
	addl	-64+4*13(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx		# e += (c ^ d ^ b)
	movl	%edx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ecx		# e += rotl32(a,5)
	rorl	$2, %ebp		# b = rotl32(b,30)
# 78
	movl	%ebp, %edi		# c
	xorl	%eax, %edi		# ^d
	xorl	%edx, %edi		# ^b
	addl	-64+4*14(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx		# e += (c ^ d ^ b)
	movl	%ecx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %ebx		# e += rotl32(a,5)
	rorl	$2, %edx		# b = rotl32(b,30)
# 79
	movl	%edx, %edi		# c
	xorl	%ebp, %edi		# ^d
	xorl	%ecx, %edi		# ^b
	addl	-64+4*15(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax		# e += (c ^ d ^ b)
	movl	%ebx, %esi		#
	roll	$5, %esi		# rotl32(a,5)
	addl	%esi, %eax		# e += rotl32(a,5)
	rorl	$2, %ecx		# b = rotl32(b,30)

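# Epilogue: popq %rdi restores the ctx pointer saved at entry, and
# the remaining pops are interleaved with the five read-modify-write
# hash updates, presumably so both can overlap in the pipeline.
# Net effect, in C:
#	ctx->hash[0] += a; ctx->hash[1] += b; ctx->hash[2] += c;
#	ctx->hash[3] += d; ctx->hash[4] += e;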
	popq	%rdi		#
	popq	%r12		#
	addl	%eax, 80(%rdi)	# ctx->hash[0] += a
	popq	%r13		#
	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
	popq	%r14		#
	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
#	popq	%r15		#
	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
	popq	%rbx		#
	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e
	popq	%rbp		#

	ret
	.size	sha1_process_block64, .-sha1_process_block64

.section .rodata.cst16.sha1const, "aM", @progbits, 16
.balign 16
sha1const:
	.long	0x5A827999
	.long	0x6ED9EBA1
	.long	0x8F1BBCDC
	.long	0xCA62C1D6
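# One constant per 20-round phase:
#	K[ 0..19] = 0x5A827999, K[20..39] = 0x6ED9EBA1,
#	K[40..59] = 0x8F1BBCDC, K[60..79] = 0xCA62C1D6
# Keeping all four in a single 16-byte vector lets the code switch
# phases with one pshufd broadcast (e.g. the pshufd $0xff earlier),
# and the "aM" (alloc, mergeable) section flags with entity size 16
# let the linker share this literal across objects.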

#endif