### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
#ifdef __linux__
	.section .note.GNU-stack, "", @progbits
#endif
	.section .text.sha1_process_block64, "ax", @progbits
	.globl sha1_process_block64
	.hidden sha1_process_block64
	.type sha1_process_block64, @function

	.balign 8 # allow decoders to fetch at least 5 first insns
sha1_process_block64:
	pushq %rbp # 1 byte insn
	pushq %rbx # 1 byte insn
# pushq %r15 # 2 byte insn
	pushq %r14 # 2 byte insn
	pushq %r13 # 2 byte insn
	pushq %r12 # 2 byte insn
	pushq %rdi # we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi,r8..r14: temps
# r15: unused
# xmm0..xmm3: W[]
# xmm4,xmm5: temps
# xmm6: current round constant
# xmm7: all round constants
# -64(%rsp): area for passing RCONST + W[] from vector to integer units
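#
# For orientation, a C-style sketch of the SHA-1 compression this routine
# implements (illustrative pseudocode only, not part of the generated code;
# rotl32() stands for a 32-bit rotate):
#
#	a = ctx->hash[0]; b = ctx->hash[1]; c = ctx->hash[2];
#	d = ctx->hash[3]; e = ctx->hash[4];
#	for (n = 0; n < 80; n++) {
#		if (n < 20)      { f = ((c ^ d) & b) ^ d;       K = 0x5A827999; }
#		else if (n < 40) { f = b ^ c ^ d;               K = 0x6ED9EBA1; }
#		else if (n < 60) { f = ((b | c) & d) | (b & c); K = 0x8F1BBCDC; }
#		else             { f = b ^ c ^ d;               K = 0xCA62C1D6; }
#		tmp = rotl32(a, 5) + f + e + K + W[n];
#		e = d; d = c; c = rotl32(b, 30); b = a; a = tmp;
#	}
#
# The code below keeps a..e in eax..edx,ebp and renames registers from one
# round to the next instead of shuffling values, so each "# N" block below
# is one loop iteration.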

	movl 80(%rdi), %eax # a = ctx->hash[0]
	movl 84(%rdi), %ebx # b = ctx->hash[1]
	movl 88(%rdi), %ecx # c = ctx->hash[2]
	movl 92(%rdi), %edx # d = ctx->hash[3]
	movl 96(%rdi), %ebp # e = ctx->hash[4]

	movaps sha1const(%rip), %xmm7
	pshufd $0x00, %xmm7, %xmm6

	# Load W[] to xmm0..3, byteswapping on the fly.
	#
	# For iterations 0..15, we pass W[] in rsi,r8..r14
	# for use in RD1As instead of spilling them to stack.
	# We lose parallelized addition of RCONST, but LEA
	# can do two additions at once, so it is probably a wash.
	# (We use rsi instead of rN because this makes two
	# LEAs in two first RD1As shorter by one byte).
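	#
	# Each 8-byte load below packs two big-endian W[] words per register.
	# Rough C sketch of one such load (bswap64/rotl64 are shorthand for
	# the bswapq/rolq instructions, not real helpers in this file):
	#	v = *(uint64_t*)&input[n];	/* bytes of W[n],W[n+1], big-endian */
	#	v = bswap64(v);			/* = W[n]:W[n+1], words in CPU order */
	#	v = rotl64(v, 32);		/* = W[n+1]:W[n], ready for movq to xmm */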
	movq 4*0(%rdi), %rsi
	movq 4*2(%rdi), %r8
	bswapq %rsi
	bswapq %r8
	rolq $32, %rsi # rsi = W[1]:W[0]
	rolq $32, %r8 # r8 = W[3]:W[2]
	movq %rsi, %xmm0
	movq %r8, %xmm4
	punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
# movaps %xmm0, %xmm4 # add RCONST, spill to stack
# paddd %xmm6, %xmm4
# movups %xmm4, -64+16*0(%rsp)

	movq 4*4(%rdi), %r9
	movq 4*6(%rdi), %r10
	bswapq %r9
	bswapq %r10
	rolq $32, %r9 # r9 = W[5]:W[4]
	rolq $32, %r10 # r10 = W[7]:W[6]
	movq %r9, %xmm1
	movq %r10, %xmm4
	punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])

	movq 4*8(%rdi), %r11
	movq 4*10(%rdi), %r12
	bswapq %r11
	bswapq %r12
	rolq $32, %r11 # r11 = W[9]:W[8]
	rolq $32, %r12 # r12 = W[11]:W[10]
	movq %r11, %xmm2
	movq %r12, %xmm4
	punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])

	movq 4*12(%rdi), %r13
	movq 4*14(%rdi), %r14
	bswapq %r13
	bswapq %r14
	rolq $32, %r13 # r13 = W[13]:W[12]
	rolq $32, %r14 # r14 = W[15]:W[14]
	movq %r13, %xmm3
	movq %r14, %xmm4
	punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])

# 0
	leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
	shrq $32, %rsi
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 1
	leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 2
	leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
	shrq $32, %r8
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 3
	leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 4
	leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
	shrq $32, %r9
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 5
	leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 6
	leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
	shrq $32, %r10
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 7
	leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps %xmm3, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm0, %xmm5
	shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm0 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm0, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm0, %xmm0 # shift left by 1
	psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm0, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*0(%rsp)
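# Each PREP block computes the next four W[] words with SSE2 and stores
# them, with the round constant already added, into one 16-byte stack slot.
# Scalar sketch of the recurrence it vectorizes (the standard SHA-1 schedule):
#	W[n] = rotl32(W[n-3] ^ W[n-8] ^ W[n-14] ^ W[n-16], 1);
# The fourth word of each group needs the first one (its W[n-3] is being
# produced in the same step), hence the "W[3] fixup" above: that missing
# term is folded in afterwards as rol(unrotW[0],2).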
# 8
	leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
	shrq $32, %r11
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 9
	leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 10
	leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
	shrq $32, %r12
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 11
	leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
	pshufd $0x55, %xmm7, %xmm6
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps %xmm0, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm1, %xmm5
	shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm1 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm1, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm1, %xmm1 # shift left by 1
	psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm1, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*1(%rsp)
# 12
	leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
	shrq $32, %r13
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 13
	leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 14
	leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
	shrq $32, %r14
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 15
	leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	andl %ebx, %edi # &b
	xorl %edx, %edi # (((c ^ d) & b) ^ d)
	addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
	movl %eax, %edi #
	roll $5, %edi # rotl32(a,5)
	addl %edi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps %xmm1, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm2, %xmm5
	shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm2 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm2, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm2, %xmm2 # shift left by 1
	psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm2, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*2(%rsp)
# 16
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	andl %eax, %edi # &b
	xorl %ecx, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (((c ^ d) & b) ^ d)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 17
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	andl %ebp, %edi # &b
	xorl %ebx, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 18
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	andl %edx, %edi # &b
	xorl %eax, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 19
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	andl %ecx, %edi # &b
	xorl %ebp, %edi # (((c ^ d) & b) ^ d)
	addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (((c ^ d) & b) ^ d)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps %xmm2, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm3, %xmm5
	shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm3 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm3, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm3, %xmm3 # shift left by 1
	psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm3, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*3(%rsp)
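# Rounds 20..39 switch to the parity function; their constant (0x6ED9EBA1,
# selected into xmm6 by the pshufd $0x55 above) is already folded into the
# stack slots by PREP.  Per-round C sketch:
#	f = c ^ d ^ b;
#	e += f + rotl32(a, 5) + (RCONST + W[n & 15]);	/* one addl from stack */
#	b = rotl32(b, 30);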
# 20
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 21
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 22
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 23
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps %xmm3, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm0, %xmm5
	shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm0 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm0, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm0, %xmm0 # shift left by 1
	psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm0, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*0(%rsp)
# 24
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 25
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 26
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 27
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps %xmm0, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm1, %xmm5
	shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm1 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm1, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm1, %xmm1 # shift left by 1
	psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm1, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*1(%rsp)
# 28
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 29
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 30
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# 31
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
	pshufd $0xaa, %xmm7, %xmm6
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps %xmm1, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm2, %xmm5
	shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm2 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm2, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm2, %xmm2 # shift left by 1
	psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm2, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*2(%rsp)
# 32
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 33
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 34
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# 35
	movl %ecx, %edi # c
	xorl %edx, %edi # ^d
	xorl %ebx, %edi # ^b
	addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
	addl %edi, %ebp # e += (c ^ d ^ b)
	movl %eax, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebp # e += rotl32(a,5)
	rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps %xmm2, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm3, %xmm5
	shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm3 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm3, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm3, %xmm3 # shift left by 1
	psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm3, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*3(%rsp)
# 36
	movl %ebx, %edi # c
	xorl %ecx, %edi # ^d
	xorl %eax, %edi # ^b
	addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
	addl %edi, %edx # e += (c ^ d ^ b)
	movl %ebp, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %edx # e += rotl32(a,5)
	rorl $2, %eax # b = rotl32(b,30)
# 37
	movl %eax, %edi # c
	xorl %ebx, %edi # ^d
	xorl %ebp, %edi # ^b
	addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
	addl %edi, %ecx # e += (c ^ d ^ b)
	movl %edx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ecx # e += rotl32(a,5)
	rorl $2, %ebp # b = rotl32(b,30)
# 38
	movl %ebp, %edi # c
	xorl %eax, %edi # ^d
	xorl %edx, %edi # ^b
	addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
	addl %edi, %ebx # e += (c ^ d ^ b)
	movl %ecx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %ebx # e += rotl32(a,5)
	rorl $2, %edx # b = rotl32(b,30)
# 39
	movl %edx, %edi # c
	xorl %ebp, %edi # ^d
	xorl %ecx, %edi # ^b
	addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
	addl %edi, %eax # e += (c ^ d ^ b)
	movl %ebx, %esi #
	roll $5, %esi # rotl32(a,5)
	addl %esi, %eax # e += rotl32(a,5)
	rorl $2, %ecx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps %xmm3, %xmm4
	psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps %xmm0, %xmm5
	shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps %xmm5, %xmm0 # ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps %xmm0, %xmm5
	xorps %xmm4, %xmm4 # rol(W0,1):
	pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
	paddd %xmm0, %xmm0 # shift left by 1
	psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps %xmm5, %xmm4
	pslld $2, %xmm5
	psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
	xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
	xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps %xmm0, %xmm5
	paddd %xmm6, %xmm5
	movups %xmm5, -64+16*0(%rsp)
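# Rounds 40..59 use the majority function; their constant (0x8F1BBCDC,
# selected into xmm6 by the pshufd $0xaa above) is likewise pre-added on the
# stack.  Per-round C sketch:
#	f = ((b | c) & d) | (b & c);	/* == (b & c) | (b & d) | (c & d) */
#	e += f + rotl32(a, 5) + (RCONST + W[n & 15]);
#	b = rotl32(b, 30);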
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100793# 40
794 movl %ebx, %edi # di: b
795 movl %ebx, %esi # si: b
796 orl %ecx, %edi # di: b | c
797 andl %ecx, %esi # si: b & c
798 andl %edx, %edi # di: (b | c) & d
799 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100800 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100801 addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100802 movl %eax, %esi #
803 roll $5, %esi # rotl32(a,5)
804 addl %esi, %ebp # e += rotl32(a,5)
805 rorl $2, %ebx # b = rotl32(b,30)
806# 41
807 movl %eax, %edi # di: b
808 movl %eax, %esi # si: b
809 orl %ebx, %edi # di: b | c
810 andl %ebx, %esi # si: b & c
811 andl %ecx, %edi # di: (b | c) & d
812 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100813 addl %edi, %edx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100814 addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100815 movl %ebp, %esi #
816 roll $5, %esi # rotl32(a,5)
817 addl %esi, %edx # e += rotl32(a,5)
818 rorl $2, %eax # b = rotl32(b,30)
819# 42
820 movl %ebp, %edi # di: b
821 movl %ebp, %esi # si: b
822 orl %eax, %edi # di: b | c
823 andl %eax, %esi # si: b & c
824 andl %ebx, %edi # di: (b | c) & d
825 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100826 addl %edi, %ecx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100827 addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100828 movl %edx, %esi #
829 roll $5, %esi # rotl32(a,5)
830 addl %esi, %ecx # e += rotl32(a,5)
831 rorl $2, %ebp # b = rotl32(b,30)
832# 43
833 movl %edx, %edi # di: b
834 movl %edx, %esi # si: b
835 orl %ebp, %edi # di: b | c
836 andl %ebp, %esi # si: b & c
837 andl %eax, %edi # di: (b | c) & d
838 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100839 addl %edi, %ebx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100840 addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100841 movl %ecx, %esi #
842 roll $5, %esi # rotl32(a,5)
843 addl %esi, %ebx # e += rotl32(a,5)
844 rorl $2, %edx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100845# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
846 movaps %xmm0, %xmm4
847 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100848# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
849# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
850# same result as above, but shorter and faster:
851# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
852# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
853 movaps %xmm1, %xmm5
854 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100855 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
856 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
857 xorps %xmm5, %xmm1 # ^
858 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
859 movaps %xmm1, %xmm5
860 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100861 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
862 paddd %xmm1, %xmm1 # shift left by 1
863 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100864 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
865 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
866 movaps %xmm5, %xmm4
867 pslld $2, %xmm5
868 psrld $30, %xmm4
869# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
870 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
871 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
872 movaps %xmm1, %xmm5
873 paddd %xmm6, %xmm5
874 movups %xmm5, -64+16*1(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100875# 44
876 movl %ecx, %edi # di: b
877 movl %ecx, %esi # si: b
878 orl %edx, %edi # di: b | c
879 andl %edx, %esi # si: b & c
880 andl %ebp, %edi # di: (b | c) & d
881 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100882 addl %edi, %eax # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100883 addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100884 movl %ebx, %esi #
885 roll $5, %esi # rotl32(a,5)
886 addl %esi, %eax # e += rotl32(a,5)
887 rorl $2, %ecx # b = rotl32(b,30)
888# 45
889 movl %ebx, %edi # di: b
890 movl %ebx, %esi # si: b
891 orl %ecx, %edi # di: b | c
892 andl %ecx, %esi # si: b & c
893 andl %edx, %edi # di: (b | c) & d
894 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100895 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100896 addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100897 movl %eax, %esi #
898 roll $5, %esi # rotl32(a,5)
899 addl %esi, %ebp # e += rotl32(a,5)
900 rorl $2, %ebx # b = rotl32(b,30)
901# 46
902 movl %eax, %edi # di: b
903 movl %eax, %esi # si: b
904 orl %ebx, %edi # di: b | c
905 andl %ebx, %esi # si: b & c
906 andl %ecx, %edi # di: (b | c) & d
907 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100908 addl %edi, %edx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100909 addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100910 movl %ebp, %esi #
911 roll $5, %esi # rotl32(a,5)
912 addl %esi, %edx # e += rotl32(a,5)
913 rorl $2, %eax # b = rotl32(b,30)
914# 47
915 movl %ebp, %edi # di: b
916 movl %ebp, %esi # si: b
917 orl %eax, %edi # di: b | c
918 andl %eax, %esi # si: b & c
919 andl %ebx, %edi # di: (b | c) & d
920 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100921 addl %edi, %ecx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100922 addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100923 movl %edx, %esi #
924 roll $5, %esi # rotl32(a,5)
925 addl %esi, %ecx # e += rotl32(a,5)
926 rorl $2, %ebp # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100927# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
928 movaps %xmm1, %xmm4
929 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100930# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
931# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
932# same result as above, but shorter and faster:
933# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
934# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
935 movaps %xmm2, %xmm5
936 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100937 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
938 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
939 xorps %xmm5, %xmm2 # ^
940 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
941 movaps %xmm2, %xmm5
942 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100943 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
944 paddd %xmm2, %xmm2 # shift left by 1
945 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100946 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
947 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
948 movaps %xmm5, %xmm4
949 pslld $2, %xmm5
950 psrld $30, %xmm4
951# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
952 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
953 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
954 movaps %xmm2, %xmm5
955 paddd %xmm6, %xmm5
956 movups %xmm5, -64+16*2(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100957# 48
958 movl %edx, %edi # di: b
959 movl %edx, %esi # si: b
960 orl %ebp, %edi # di: b | c
961 andl %ebp, %esi # si: b & c
962 andl %eax, %edi # di: (b | c) & d
963 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100964 addl %edi, %ebx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100965 addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100966 movl %ecx, %esi #
967 roll $5, %esi # rotl32(a,5)
968 addl %esi, %ebx # e += rotl32(a,5)
969 rorl $2, %edx # b = rotl32(b,30)
970# 49
971 movl %ecx, %edi # di: b
972 movl %ecx, %esi # si: b
973 orl %edx, %edi # di: b | c
974 andl %edx, %esi # si: b & c
975 andl %ebp, %edi # di: (b | c) & d
976 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100977 addl %edi, %eax # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100978 addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100979 movl %ebx, %esi #
980 roll $5, %esi # rotl32(a,5)
981 addl %esi, %eax # e += rotl32(a,5)
982 rorl $2, %ecx # b = rotl32(b,30)
983# 50
984 movl %ebx, %edi # di: b
985 movl %ebx, %esi # si: b
986 orl %ecx, %edi # di: b | c
987 andl %ecx, %esi # si: b & c
988 andl %edx, %edi # di: (b | c) & d
989 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100990 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100991 addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100992 movl %eax, %esi #
993 roll $5, %esi # rotl32(a,5)
994 addl %esi, %ebp # e += rotl32(a,5)
995 rorl $2, %ebx # b = rotl32(b,30)
996# 51
997 movl %eax, %edi # di: b
998 movl %eax, %esi # si: b
999 orl %ebx, %edi # di: b | c
1000 andl %ebx, %esi # si: b & c
1001 andl %ecx, %edi # di: (b | c) & d
1002 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001003 addl %edi, %edx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001004 addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001005 movl %ebp, %esi #
1006 roll $5, %esi # rotl32(a,5)
1007 addl %esi, %edx # e += rotl32(a,5)
1008 rorl $2, %eax # b = rotl32(b,30)
Denys Vlasenko4923f742022-02-08 03:29:16 +01001009 pshufd $0xff, %xmm7, %xmm6
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001010# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1011 movaps %xmm2, %xmm4
1012 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +01001013# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1014# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1015# same result as above, but shorter and faster:
1016# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1017# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1018 movaps %xmm3, %xmm5
1019 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001020 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1021 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1022 xorps %xmm5, %xmm3 # ^
1023 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1024 movaps %xmm3, %xmm5
1025 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +01001026 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1027 paddd %xmm3, %xmm3 # shift left by 1
1028 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001029 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1030 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1031 movaps %xmm5, %xmm4
1032 pslld $2, %xmm5
1033 psrld $30, %xmm4
1034# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1035 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
1036 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1037 movaps %xmm3, %xmm5
1038 paddd %xmm6, %xmm5
1039 movups %xmm5, -64+16*3(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001040# 52
1041 movl %ebp, %edi # di: b
1042 movl %ebp, %esi # si: b
1043 orl %eax, %edi # di: b | c
1044 andl %eax, %esi # si: b & c
1045 andl %ebx, %edi # di: (b | c) & d
1046 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001047 addl %edi, %ecx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001048 addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001049 movl %edx, %esi #
1050 roll $5, %esi # rotl32(a,5)
1051 addl %esi, %ecx # e += rotl32(a,5)
1052 rorl $2, %ebp # b = rotl32(b,30)
1053# 53
1054 movl %edx, %edi # di: b
1055 movl %edx, %esi # si: b
1056 orl %ebp, %edi # di: b | c
1057 andl %ebp, %esi # si: b & c
1058 andl %eax, %edi # di: (b | c) & d
1059 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001060 addl %edi, %ebx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001061 addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001062 movl %ecx, %esi #
1063 roll $5, %esi # rotl32(a,5)
1064 addl %esi, %ebx # e += rotl32(a,5)
1065 rorl $2, %edx # b = rotl32(b,30)
1066# 54
1067 movl %ecx, %edi # di: b
1068 movl %ecx, %esi # si: b
1069 orl %edx, %edi # di: b | c
1070 andl %edx, %esi # si: b & c
1071 andl %ebp, %edi # di: (b | c) & d
1072 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001073 addl %edi, %eax # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001074 addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001075 movl %ebx, %esi #
1076 roll $5, %esi # rotl32(a,5)
1077 addl %esi, %eax # e += rotl32(a,5)
1078 rorl $2, %ecx # b = rotl32(b,30)
1079# 55
1080 movl %ebx, %edi # di: b
1081 movl %ebx, %esi # si: b
1082 orl %ecx, %edi # di: b | c
1083 andl %ecx, %esi # si: b & c
1084 andl %edx, %edi # di: (b | c) & d
1085 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001086 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001087 addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001088 movl %eax, %esi #
1089 roll $5, %esi # rotl32(a,5)
1090 addl %esi, %ebp # e += rotl32(a,5)
1091 rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
        movaps %xmm3, %xmm4
        psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
        movaps %xmm0, %xmm5
        shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps %xmm5, %xmm0 # ^
        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
        movaps %xmm0, %xmm5
        xorps %xmm4, %xmm4 # rol(W0,1):
        pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
        paddd %xmm0, %xmm0 # shift left by 1
        psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
        pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
        movaps %xmm5, %xmm4
        pslld $2, %xmm5
        psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
        xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
        xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
        movaps %xmm0, %xmm5
        paddd %xmm6, %xmm5
        movups %xmm5, -64+16*0(%rsp)
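# What one PREP step computes, as a C sketch (illustrative only, not part
# of the generated code): W[0..15] is the current 16-word window and
# nW[0..3] are the four schedule words being produced.
#
#   uint32_t nW[4];
#   for (int i = 0; i < 4; i++)
#       nW[i] = rotl32(W[i] ^ W[i+2] ^ W[i+8] ^ (i < 3 ? W[i+13] : 0), 1);
#   /* W[3] fixup: the i==3 term is nW[0] itself, which did not exist
#    * when the xors above ran, so xor in rol(nW[0],1) afterwards */
#   nW[3] ^= rotl32(nW[0], 1);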
# 56
        movl %eax, %edi # di: b
        movl %eax, %esi # si: b
        orl %ebx, %edi # di: b | c
        andl %ebx, %esi # si: b & c
        andl %ecx, %edi # di: (b | c) & d
        orl %esi, %edi # ((b | c) & d) | (b & c)
        addl %edi, %edx # += ((b | c) & d) | (b & c)
        addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
        movl %ebp, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %edx # e += rotl32(a,5)
        rorl $2, %eax # b = rotl32(b,30)
# 57
        movl %ebp, %edi # di: b
        movl %ebp, %esi # si: b
        orl %eax, %edi # di: b | c
        andl %eax, %esi # si: b & c
        andl %ebx, %edi # di: (b | c) & d
        orl %esi, %edi # ((b | c) & d) | (b & c)
        addl %edi, %ecx # += ((b | c) & d) | (b & c)
        addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
        movl %edx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ecx # e += rotl32(a,5)
        rorl $2, %ebp # b = rotl32(b,30)
# 58
        movl %edx, %edi # di: b
        movl %edx, %esi # si: b
        orl %ebp, %edi # di: b | c
        andl %ebp, %esi # si: b & c
        andl %eax, %edi # di: (b | c) & d
        orl %esi, %edi # ((b | c) & d) | (b & c)
        addl %edi, %ebx # += ((b | c) & d) | (b & c)
        addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
        movl %ecx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebx # e += rotl32(a,5)
        rorl $2, %edx # b = rotl32(b,30)
# 59
        movl %ecx, %edi # di: b
        movl %ecx, %esi # si: b
        orl %edx, %edi # di: b | c
        andl %edx, %esi # si: b & c
        andl %ebp, %edi # di: (b | c) & d
        orl %esi, %edi # ((b | c) & d) | (b & c)
        addl %edi, %eax # += ((b | c) & d) | (b & c)
        addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
        movl %ebx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %eax # e += rotl32(a,5)
        rorl $2, %ecx # b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
        movaps %xmm0, %xmm4
        psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
        movaps %xmm1, %xmm5
        shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps %xmm5, %xmm1 # ^
        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
        movaps %xmm1, %xmm5
        xorps %xmm4, %xmm4 # rol(W0,1):
        pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
        paddd %xmm1, %xmm1 # shift left by 1
        psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
        pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
        movaps %xmm5, %xmm4
        pslld $2, %xmm5
        psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
        xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
        xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
        movaps %xmm1, %xmm5
        paddd %xmm6, %xmm5
        movups %xmm5, -64+16*1(%rsp)
# 60
        movl %ecx, %edi # c
        xorl %edx, %edi # ^d
        xorl %ebx, %edi # ^b
        addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
        addl %edi, %ebp # e += (c ^ d ^ b)
        movl %eax, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebp # e += rotl32(a,5)
        rorl $2, %ebx # b = rotl32(b,30)
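# Rounds 60..79 use the parity function f(b,c,d) = b ^ c ^ d, so each
# round needs only a mov and two xors to form its mixing term.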
# 61
        movl %ebx, %edi # c
        xorl %ecx, %edi # ^d
        xorl %eax, %edi # ^b
        addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
        addl %edi, %edx # e += (c ^ d ^ b)
        movl %ebp, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %edx # e += rotl32(a,5)
        rorl $2, %eax # b = rotl32(b,30)
# 62
        movl %eax, %edi # c
        xorl %ebx, %edi # ^d
        xorl %ebp, %edi # ^b
        addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
        addl %edi, %ecx # e += (c ^ d ^ b)
        movl %edx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ecx # e += rotl32(a,5)
        rorl $2, %ebp # b = rotl32(b,30)
# 63
        movl %ebp, %edi # c
        xorl %eax, %edi # ^d
        xorl %edx, %edi # ^b
        addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
        addl %edi, %ebx # e += (c ^ d ^ b)
        movl %ecx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebx # e += rotl32(a,5)
        rorl $2, %edx # b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
        movaps %xmm1, %xmm4
        psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
        movaps %xmm2, %xmm5
        shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps %xmm5, %xmm2 # ^
        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
        movaps %xmm2, %xmm5
        xorps %xmm4, %xmm4 # rol(W0,1):
        pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
        paddd %xmm2, %xmm2 # shift left by 1
        psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
        pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
        movaps %xmm5, %xmm4
        pslld $2, %xmm5
        psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
        xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
        xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
        movaps %xmm2, %xmm5
        paddd %xmm6, %xmm5
        movups %xmm5, -64+16*2(%rsp)
# 64
        movl %edx, %edi # c
        xorl %ebp, %edi # ^d
        xorl %ecx, %edi # ^b
        addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
        addl %edi, %eax # e += (c ^ d ^ b)
        movl %ebx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %eax # e += rotl32(a,5)
        rorl $2, %ecx # b = rotl32(b,30)
# 65
        movl %ecx, %edi # c
        xorl %edx, %edi # ^d
        xorl %ebx, %edi # ^b
        addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
        addl %edi, %ebp # e += (c ^ d ^ b)
        movl %eax, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebp # e += rotl32(a,5)
        rorl $2, %ebx # b = rotl32(b,30)
# 66
        movl %ebx, %edi # c
        xorl %ecx, %edi # ^d
        xorl %eax, %edi # ^b
        addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
        addl %edi, %edx # e += (c ^ d ^ b)
        movl %ebp, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %edx # e += rotl32(a,5)
        rorl $2, %eax # b = rotl32(b,30)
# 67
        movl %eax, %edi # c
        xorl %ebx, %edi # ^d
        xorl %ebp, %edi # ^b
        addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
        addl %edi, %ecx # e += (c ^ d ^ b)
        movl %edx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ecx # e += rotl32(a,5)
        rorl $2, %ebp # b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
        movaps %xmm2, %xmm4
        psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
        movaps %xmm3, %xmm5
        shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
        xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
        xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
        xorps %xmm5, %xmm3 # ^
        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
        movaps %xmm3, %xmm5
        xorps %xmm4, %xmm4 # rol(W0,1):
        pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
        paddd %xmm3, %xmm3 # shift left by 1
        psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
        pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
        movaps %xmm5, %xmm4
        pslld $2, %xmm5
        psrld $30, %xmm4
# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
        xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
        xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
        movaps %xmm3, %xmm5
        paddd %xmm6, %xmm5
        movups %xmm5, -64+16*3(%rsp)
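# Last schedule expansion for this block: the four words just stored at
# -64+16*3(%rsp) are consumed by rounds 76..79 below.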
# 68
        movl %ebp, %edi # c
        xorl %eax, %edi # ^d
        xorl %edx, %edi # ^b
        addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
        addl %edi, %ebx # e += (c ^ d ^ b)
        movl %ecx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebx # e += rotl32(a,5)
        rorl $2, %edx # b = rotl32(b,30)
# 69
        movl %edx, %edi # c
        xorl %ebp, %edi # ^d
        xorl %ecx, %edi # ^b
        addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
        addl %edi, %eax # e += (c ^ d ^ b)
        movl %ebx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %eax # e += rotl32(a,5)
        rorl $2, %ecx # b = rotl32(b,30)
# 70
        movl %ecx, %edi # c
        xorl %edx, %edi # ^d
        xorl %ebx, %edi # ^b
        addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
        addl %edi, %ebp # e += (c ^ d ^ b)
        movl %eax, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebp # e += rotl32(a,5)
        rorl $2, %ebx # b = rotl32(b,30)
# 71
        movl %ebx, %edi # c
        xorl %ecx, %edi # ^d
        xorl %eax, %edi # ^b
        addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
        addl %edi, %edx # e += (c ^ d ^ b)
        movl %ebp, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %edx # e += rotl32(a,5)
        rorl $2, %eax # b = rotl32(b,30)
# 72
        movl %eax, %edi # c
        xorl %ebx, %edi # ^d
        xorl %ebp, %edi # ^b
        addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
        addl %edi, %ecx # e += (c ^ d ^ b)
        movl %edx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ecx # e += rotl32(a,5)
        rorl $2, %ebp # b = rotl32(b,30)
# 73
        movl %ebp, %edi # c
        xorl %eax, %edi # ^d
        xorl %edx, %edi # ^b
        addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
        addl %edi, %ebx # e += (c ^ d ^ b)
        movl %ecx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebx # e += rotl32(a,5)
        rorl $2, %edx # b = rotl32(b,30)
# 74
        movl %edx, %edi # c
        xorl %ebp, %edi # ^d
        xorl %ecx, %edi # ^b
        addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
        addl %edi, %eax # e += (c ^ d ^ b)
        movl %ebx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %eax # e += rotl32(a,5)
        rorl $2, %ecx # b = rotl32(b,30)
# 75
        movl %ecx, %edi # c
        xorl %edx, %edi # ^d
        xorl %ebx, %edi # ^b
        addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
        addl %edi, %ebp # e += (c ^ d ^ b)
        movl %eax, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebp # e += rotl32(a,5)
        rorl $2, %ebx # b = rotl32(b,30)
# 76
        movl %ebx, %edi # c
        xorl %ecx, %edi # ^d
        xorl %eax, %edi # ^b
        addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
        addl %edi, %edx # e += (c ^ d ^ b)
        movl %ebp, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %edx # e += rotl32(a,5)
        rorl $2, %eax # b = rotl32(b,30)
# 77
        movl %eax, %edi # c
        xorl %ebx, %edi # ^d
        xorl %ebp, %edi # ^b
        addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
        addl %edi, %ecx # e += (c ^ d ^ b)
        movl %edx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ecx # e += rotl32(a,5)
        rorl $2, %ebp # b = rotl32(b,30)
# 78
        movl %ebp, %edi # c
        xorl %eax, %edi # ^d
        xorl %edx, %edi # ^b
        addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
        addl %edi, %ebx # e += (c ^ d ^ b)
        movl %ecx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %ebx # e += rotl32(a,5)
        rorl $2, %edx # b = rotl32(b,30)
# 79
        movl %edx, %edi # c
        xorl %ebp, %edi # ^d
        xorl %ecx, %edi # ^b
        addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
        addl %edi, %eax # e += (c ^ d ^ b)
        movl %ebx, %esi #
        roll $5, %esi # rotl32(a,5)
        addl %esi, %eax # e += rotl32(a,5)
        rorl $2, %ecx # b = rotl32(b,30)
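# All 80 rounds are done.  The epilogue restores the saved ctx pointer,
# then interleaves the remaining callee-saved register pops with the
# five ctx->hash[] updates.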

        popq %rdi #
        popq %r12 #
        addl %eax, 80(%rdi) # ctx->hash[0] += a
        popq %r13 #
        addl %ebx, 84(%rdi) # ctx->hash[1] += b
        popq %r14 #
        addl %ecx, 88(%rdi) # ctx->hash[2] += c
# popq %r15 #
        addl %edx, 92(%rdi) # ctx->hash[3] += d
        popq %rbx #
        addl %ebp, 96(%rdi) # ctx->hash[4] += e
        popq %rbp #

        ret
        .size sha1_process_block64, .-sha1_process_block64

        .section .rodata.cst16.sha1const, "aM", @progbits, 16
        .balign 16
sha1const:
        .long 0x5A827999
        .long 0x6ED9EBA1
        .long 0x8F1BBCDC
        .long 0xCA62C1D6
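# The four standard SHA-1 round constants, one per 20-round group:
# 0x5A827999 (rounds 0..19), 0x6ED9EBA1 (20..39),
# 0x8F1BBCDC (40..59), 0xCA62C1D6 (60..79).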

#endif