blob: 1d55b91f84802d2f9b9c5d2849eee0dddf149131 [file] [log] [blame]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001### Generated by hash_md5_sha_x86-64.S.sh ###
Denys Vlasenko947bef02022-01-03 13:00:07 +01002
3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
Denys Vlasenko205042c2022-01-25 17:00:57 +01004 .section .text.sha1_process_block64, "ax", @progbits
Denys Vlasenko805ecec2022-01-08 00:41:09 +01005 .globl sha1_process_block64
6 .hidden sha1_process_block64
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01007 .type sha1_process_block64, @function
Denys Vlasenko947bef02022-01-03 13:00:07 +01008
Denys Vlasenkoc3cfcc92022-01-04 01:45:13 +01009 .balign 8 # allow decoders to fetch at least 5 first insns
Denys Vlasenko05fd13e2022-01-03 01:57:29 +010010sha1_process_block64:
Denys Vlasenkoc3cfcc92022-01-04 01:45:13 +010011 pushq %rbp # 1 byte insn
12 pushq %rbx # 1 byte insn
Denys Vlasenko205042c2022-01-25 17:00:57 +010013# pushq %r15 # 2 byte insn
Denys Vlasenkoc3cfcc92022-01-04 01:45:13 +010014 pushq %r14 # 2 byte insn
15 pushq %r13 # 2 byte insn
16 pushq %r12 # 2 byte insn
Denys Vlasenko947bef02022-01-03 13:00:07 +010017 pushq %rdi # we need ctx at the end
Denys Vlasenko05fd13e2022-01-03 01:57:29 +010018
19#Register and stack use:
20# eax..edx: a..d
21# ebp: e
Denys Vlasenko205042c2022-01-25 17:00:57 +010022# esi,edi,r8..r14: temps
23# r15: unused
Denys Vlasenko39369ff2022-01-23 09:27:30 +010024# xmm0..xmm3: W[]
25# xmm4,xmm5: temps
26# xmm6: current round constant
Denys Vlasenko4923f742022-02-08 03:29:16 +010027# xmm7: all round constants
Denys Vlasenko39369ff2022-01-23 09:27:30 +010028# -64(%rsp): area for passing RCONST + W[] from vector to integer units
Denys Vlasenkoc3cfcc92022-01-04 01:45:13 +010029
Denys Vlasenko05fd13e2022-01-03 01:57:29 +010030 movl 80(%rdi), %eax # a = ctx->hash[0]
31 movl 84(%rdi), %ebx # b = ctx->hash[1]
32 movl 88(%rdi), %ecx # c = ctx->hash[2]
33 movl 92(%rdi), %edx # d = ctx->hash[3]
34 movl 96(%rdi), %ebp # e = ctx->hash[4]
35
Denys Vlasenkodda77e82022-02-11 14:53:26 +010036 movaps sha1const(%rip), %xmm7
37 pshufd $0x00, %xmm7, %xmm6
38
39 # Load W[] to xmm0..3, byteswapping on the fly.
40 #
41 # For iterations 0..15, we pass W[] in rsi,r8..r14
42 # for use in RD1As instead of spilling them to stack.
43 # We lose parallelized addition of RCONST, but LEA
44 # can do two additions at once, so it is probably a wash.
45 # (We use rsi instead of rN because this makes two
46 # LEAs in two first RD1As shorter by one byte).
47 movq 4*0(%rdi), %rsi
48 movq 4*2(%rdi), %r8
49 bswapq %rsi
50 bswapq %r8
51 rolq $32, %rsi # rsi = W[1]:W[0]
52 rolq $32, %r8 # r8 = W[3]:W[2]
53 movq %rsi, %xmm0
54 movq %r8, %xmm4
55 punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
56# movaps %xmm0, %xmm4 # add RCONST, spill to stack
57# paddd %xmm6, %xmm4
58# movups %xmm4, -64+16*0(%rsp)
59
60 movq 4*4(%rdi), %r9
61 movq 4*6(%rdi), %r10
62 bswapq %r9
63 bswapq %r10
64 rolq $32, %r9 # r9 = W[5]:W[4]
65 rolq $32, %r10 # r10 = W[7]:W[6]
66 movq %r9, %xmm1
67 movq %r10, %xmm4
68 punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
69
70 movq 4*8(%rdi), %r11
71 movq 4*10(%rdi), %r12
72 bswapq %r11
73 bswapq %r12
Denys Vlasenko1f272c02022-02-11 23:03:27 +010074 rolq $32, %r11 # r11 = W[9]:W[8]
75 rolq $32, %r12 # r12 = W[11]:W[10]
Denys Vlasenkodda77e82022-02-11 14:53:26 +010076 movq %r11, %xmm2
77 movq %r12, %xmm4
78 punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
79
80 movq 4*12(%rdi), %r13
81 movq 4*14(%rdi), %r14
82 bswapq %r13
83 bswapq %r14
Denys Vlasenko1f272c02022-02-11 23:03:27 +010084 rolq $32, %r13 # r13 = W[13]:W[12]
85 rolq $32, %r14 # r14 = W[15]:W[14]
Denys Vlasenkodda77e82022-02-11 14:53:26 +010086 movq %r13, %xmm3
87 movq %r14, %xmm4
88 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
89
Denys Vlasenko05fd13e2022-01-03 01:57:29 +010090# 0
Denys Vlasenkodda77e82022-02-11 14:53:26 +010091 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +010092 shrq $32, %rsi
Denys Vlasenko05fd13e2022-01-03 01:57:29 +010093 movl %ecx, %edi # c
94 xorl %edx, %edi # ^d
95 andl %ebx, %edi # &b
96 xorl %edx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +010097 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +010098 movl %eax, %edi #
99 roll $5, %edi # rotl32(a,5)
100 addl %edi, %ebp # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100101 rorl $2, %ebx # b = rotl32(b,30)
102# 1
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100103 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100104 movl %ebx, %edi # c
105 xorl %ecx, %edi # ^d
106 andl %eax, %edi # &b
107 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100108 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100109 movl %ebp, %edi #
110 roll $5, %edi # rotl32(a,5)
111 addl %edi, %edx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100112 rorl $2, %eax # b = rotl32(b,30)
113# 2
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100114 leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100115 shrq $32, %r8
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100116 movl %eax, %edi # c
117 xorl %ebx, %edi # ^d
118 andl %ebp, %edi # &b
119 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100120 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100121 movl %edx, %edi #
122 roll $5, %edi # rotl32(a,5)
123 addl %edi, %ecx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100124 rorl $2, %ebp # b = rotl32(b,30)
125# 3
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100126 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100127 movl %ebp, %edi # c
128 xorl %eax, %edi # ^d
129 andl %edx, %edi # &b
130 xorl %eax, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100131 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100132 movl %ecx, %edi #
133 roll $5, %edi # rotl32(a,5)
134 addl %edi, %ebx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100135 rorl $2, %edx # b = rotl32(b,30)
136# 4
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100137 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100138 shrq $32, %r9
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100139 movl %edx, %edi # c
140 xorl %ebp, %edi # ^d
141 andl %ecx, %edi # &b
142 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100143 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100144 movl %ebx, %edi #
145 roll $5, %edi # rotl32(a,5)
146 addl %edi, %eax # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100147 rorl $2, %ecx # b = rotl32(b,30)
148# 5
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100149 leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100150 movl %ecx, %edi # c
151 xorl %edx, %edi # ^d
152 andl %ebx, %edi # &b
153 xorl %edx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100154 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100155 movl %eax, %edi #
156 roll $5, %edi # rotl32(a,5)
157 addl %edi, %ebp # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100158 rorl $2, %ebx # b = rotl32(b,30)
159# 6
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100160 leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100161 shrq $32, %r10
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100162 movl %ebx, %edi # c
163 xorl %ecx, %edi # ^d
164 andl %eax, %edi # &b
165 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100166 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100167 movl %ebp, %edi #
168 roll $5, %edi # rotl32(a,5)
169 addl %edi, %edx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100170 rorl $2, %eax # b = rotl32(b,30)
171# 7
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100172 leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100173 movl %eax, %edi # c
174 xorl %ebx, %edi # ^d
175 andl %ebp, %edi # &b
176 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100177 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100178 movl %edx, %edi #
179 roll $5, %edi # rotl32(a,5)
180 addl %edi, %ecx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100181 rorl $2, %ebp # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100182# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
183 movaps %xmm3, %xmm4
184 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100185# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
186# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
187# same result as above, but shorter and faster:
188# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
189# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
190 movaps %xmm0, %xmm5
191 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100192 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
193 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
194 xorps %xmm5, %xmm0 # ^
195 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
196 movaps %xmm0, %xmm5
197 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100198 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
199 paddd %xmm0, %xmm0 # shift left by 1
200 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100201 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
202 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
203 movaps %xmm5, %xmm4
204 pslld $2, %xmm5
205 psrld $30, %xmm4
206# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
207 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
208 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
209 movaps %xmm0, %xmm5
210 paddd %xmm6, %xmm5
211 movups %xmm5, -64+16*0(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100212# 8
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100213 leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100214 shrq $32, %r11
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100215 movl %ebp, %edi # c
216 xorl %eax, %edi # ^d
217 andl %edx, %edi # &b
218 xorl %eax, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100219 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100220 movl %ecx, %edi #
221 roll $5, %edi # rotl32(a,5)
222 addl %edi, %ebx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100223 rorl $2, %edx # b = rotl32(b,30)
224# 9
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100225 leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100226 movl %edx, %edi # c
227 xorl %ebp, %edi # ^d
228 andl %ecx, %edi # &b
229 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100230 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100231 movl %ebx, %edi #
232 roll $5, %edi # rotl32(a,5)
233 addl %edi, %eax # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100234 rorl $2, %ecx # b = rotl32(b,30)
235# 10
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100236 leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100237 shrq $32, %r12
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100238 movl %ecx, %edi # c
239 xorl %edx, %edi # ^d
240 andl %ebx, %edi # &b
241 xorl %edx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100242 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100243 movl %eax, %edi #
244 roll $5, %edi # rotl32(a,5)
245 addl %edi, %ebp # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100246 rorl $2, %ebx # b = rotl32(b,30)
247# 11
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100248 leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100249 movl %ebx, %edi # c
250 xorl %ecx, %edi # ^d
251 andl %eax, %edi # &b
252 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100253 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100254 movl %ebp, %edi #
255 roll $5, %edi # rotl32(a,5)
256 addl %edi, %edx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100257 rorl $2, %eax # b = rotl32(b,30)
Denys Vlasenko4923f742022-02-08 03:29:16 +0100258 pshufd $0x55, %xmm7, %xmm6
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100259# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
260 movaps %xmm0, %xmm4
261 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100262# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
263# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
264# same result as above, but shorter and faster:
265# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
266# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
267 movaps %xmm1, %xmm5
268 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100269 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
270 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
271 xorps %xmm5, %xmm1 # ^
272 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
273 movaps %xmm1, %xmm5
274 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100275 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
276 paddd %xmm1, %xmm1 # shift left by 1
277 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100278 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
279 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
280 movaps %xmm5, %xmm4
281 pslld $2, %xmm5
282 psrld $30, %xmm4
283# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
284 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
285 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
286 movaps %xmm1, %xmm5
287 paddd %xmm6, %xmm5
288 movups %xmm5, -64+16*1(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100289# 12
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100290 leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100291 shrq $32, %r13
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100292 movl %eax, %edi # c
293 xorl %ebx, %edi # ^d
294 andl %ebp, %edi # &b
295 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100296 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100297 movl %edx, %edi #
298 roll $5, %edi # rotl32(a,5)
299 addl %edi, %ecx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100300 rorl $2, %ebp # b = rotl32(b,30)
301# 13
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100302 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100303 movl %ebp, %edi # c
304 xorl %eax, %edi # ^d
305 andl %edx, %edi # &b
306 xorl %eax, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100307 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100308 movl %ecx, %edi #
309 roll $5, %edi # rotl32(a,5)
310 addl %edi, %ebx # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100311 rorl $2, %edx # b = rotl32(b,30)
312# 14
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100313 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
Denys Vlasenko205042c2022-01-25 17:00:57 +0100314 shrq $32, %r14
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100315 movl %edx, %edi # c
316 xorl %ebp, %edi # ^d
317 andl %ecx, %edi # &b
318 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100319 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100320 movl %ebx, %edi #
321 roll $5, %edi # rotl32(a,5)
322 addl %edi, %eax # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100323 rorl $2, %ecx # b = rotl32(b,30)
324# 15
Denys Vlasenkodda77e82022-02-11 14:53:26 +0100325 leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100326 movl %ecx, %edi # c
327 xorl %edx, %edi # ^d
328 andl %ebx, %edi # &b
329 xorl %edx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100330 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
Denys Vlasenko205042c2022-01-25 17:00:57 +0100331 movl %eax, %edi #
332 roll $5, %edi # rotl32(a,5)
333 addl %edi, %ebp # e += rotl32(a,5)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100334 rorl $2, %ebx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100335# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
336 movaps %xmm1, %xmm4
337 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100338# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
339# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
340# same result as above, but shorter and faster:
341# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
342# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
343 movaps %xmm2, %xmm5
344 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100345 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
346 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
347 xorps %xmm5, %xmm2 # ^
348 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
349 movaps %xmm2, %xmm5
350 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100351 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
352 paddd %xmm2, %xmm2 # shift left by 1
353 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100354 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
355 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
356 movaps %xmm5, %xmm4
357 pslld $2, %xmm5
358 psrld $30, %xmm4
359# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
360 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
361 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
362 movaps %xmm2, %xmm5
363 paddd %xmm6, %xmm5
364 movups %xmm5, -64+16*2(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100365# 16
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100366 movl %ebx, %edi # c
367 xorl %ecx, %edi # ^d
368 andl %eax, %edi # &b
369 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100370 addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100371 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
372 movl %ebp, %esi #
373 roll $5, %esi # rotl32(a,5)
374 addl %esi, %edx # e += rotl32(a,5)
375 rorl $2, %eax # b = rotl32(b,30)
376# 17
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100377 movl %eax, %edi # c
378 xorl %ebx, %edi # ^d
379 andl %ebp, %edi # &b
380 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100381 addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100382 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
383 movl %edx, %esi #
384 roll $5, %esi # rotl32(a,5)
385 addl %esi, %ecx # e += rotl32(a,5)
386 rorl $2, %ebp # b = rotl32(b,30)
387# 18
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100388 movl %ebp, %edi # c
389 xorl %eax, %edi # ^d
390 andl %edx, %edi # &b
391 xorl %eax, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100392 addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100393 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
394 movl %ecx, %esi #
395 roll $5, %esi # rotl32(a,5)
396 addl %esi, %ebx # e += rotl32(a,5)
397 rorl $2, %edx # b = rotl32(b,30)
398# 19
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100399 movl %edx, %edi # c
400 xorl %ebp, %edi # ^d
401 andl %ecx, %edi # &b
402 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100403 addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100404 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
405 movl %ebx, %esi #
406 roll $5, %esi # rotl32(a,5)
407 addl %esi, %eax # e += rotl32(a,5)
408 rorl $2, %ecx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100409# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
410 movaps %xmm2, %xmm4
411 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100412# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
413# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
414# same result as above, but shorter and faster:
415# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
416# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
417 movaps %xmm3, %xmm5
418 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100419 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
420 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
421 xorps %xmm5, %xmm3 # ^
422 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
423 movaps %xmm3, %xmm5
424 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100425 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
426 paddd %xmm3, %xmm3 # shift left by 1
427 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100428 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
429 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
430 movaps %xmm5, %xmm4
431 pslld $2, %xmm5
432 psrld $30, %xmm4
433# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
434 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
435 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
436 movaps %xmm3, %xmm5
437 paddd %xmm6, %xmm5
438 movups %xmm5, -64+16*3(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100439# 20
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100440 movl %ecx, %edi # c
441 xorl %edx, %edi # ^d
442 xorl %ebx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100443 addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100444 addl %edi, %ebp # e += (c ^ d ^ b)
445 movl %eax, %esi #
446 roll $5, %esi # rotl32(a,5)
447 addl %esi, %ebp # e += rotl32(a,5)
448 rorl $2, %ebx # b = rotl32(b,30)
449# 21
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100450 movl %ebx, %edi # c
451 xorl %ecx, %edi # ^d
452 xorl %eax, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100453 addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100454 addl %edi, %edx # e += (c ^ d ^ b)
455 movl %ebp, %esi #
456 roll $5, %esi # rotl32(a,5)
457 addl %esi, %edx # e += rotl32(a,5)
458 rorl $2, %eax # b = rotl32(b,30)
459# 22
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100460 movl %eax, %edi # c
461 xorl %ebx, %edi # ^d
462 xorl %ebp, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100463 addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100464 addl %edi, %ecx # e += (c ^ d ^ b)
465 movl %edx, %esi #
466 roll $5, %esi # rotl32(a,5)
467 addl %esi, %ecx # e += rotl32(a,5)
468 rorl $2, %ebp # b = rotl32(b,30)
469# 23
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100470 movl %ebp, %edi # c
471 xorl %eax, %edi # ^d
472 xorl %edx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100473 addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100474 addl %edi, %ebx # e += (c ^ d ^ b)
475 movl %ecx, %esi #
476 roll $5, %esi # rotl32(a,5)
477 addl %esi, %ebx # e += rotl32(a,5)
478 rorl $2, %edx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100479# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
480 movaps %xmm3, %xmm4
481 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100482# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
483# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
484# same result as above, but shorter and faster:
485# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
486# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
487 movaps %xmm0, %xmm5
488 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100489 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
490 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
491 xorps %xmm5, %xmm0 # ^
492 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
493 movaps %xmm0, %xmm5
494 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100495 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
496 paddd %xmm0, %xmm0 # shift left by 1
497 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100498 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
499 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
500 movaps %xmm5, %xmm4
501 pslld $2, %xmm5
502 psrld $30, %xmm4
503# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
504 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
505 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
506 movaps %xmm0, %xmm5
507 paddd %xmm6, %xmm5
508 movups %xmm5, -64+16*0(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100509# 24
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100510 movl %edx, %edi # c
511 xorl %ebp, %edi # ^d
512 xorl %ecx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100513 addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100514 addl %edi, %eax # e += (c ^ d ^ b)
515 movl %ebx, %esi #
516 roll $5, %esi # rotl32(a,5)
517 addl %esi, %eax # e += rotl32(a,5)
518 rorl $2, %ecx # b = rotl32(b,30)
519# 25
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100520 movl %ecx, %edi # c
521 xorl %edx, %edi # ^d
522 xorl %ebx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100523 addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100524 addl %edi, %ebp # e += (c ^ d ^ b)
525 movl %eax, %esi #
526 roll $5, %esi # rotl32(a,5)
527 addl %esi, %ebp # e += rotl32(a,5)
528 rorl $2, %ebx # b = rotl32(b,30)
529# 26
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100530 movl %ebx, %edi # c
531 xorl %ecx, %edi # ^d
532 xorl %eax, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100533 addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100534 addl %edi, %edx # e += (c ^ d ^ b)
535 movl %ebp, %esi #
536 roll $5, %esi # rotl32(a,5)
537 addl %esi, %edx # e += rotl32(a,5)
538 rorl $2, %eax # b = rotl32(b,30)
539# 27
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100540 movl %eax, %edi # c
541 xorl %ebx, %edi # ^d
542 xorl %ebp, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100543 addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100544 addl %edi, %ecx # e += (c ^ d ^ b)
545 movl %edx, %esi #
546 roll $5, %esi # rotl32(a,5)
547 addl %esi, %ecx # e += rotl32(a,5)
548 rorl $2, %ebp # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100549# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
550 movaps %xmm0, %xmm4
551 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100552# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
553# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
554# same result as above, but shorter and faster:
555# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
556# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
557 movaps %xmm1, %xmm5
558 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100559 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
560 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
561 xorps %xmm5, %xmm1 # ^
562 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
563 movaps %xmm1, %xmm5
564 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100565 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
566 paddd %xmm1, %xmm1 # shift left by 1
567 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100568 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
569 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
570 movaps %xmm5, %xmm4
571 pslld $2, %xmm5
572 psrld $30, %xmm4
573# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
574 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
575 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
576 movaps %xmm1, %xmm5
577 paddd %xmm6, %xmm5
578 movups %xmm5, -64+16*1(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100579# 28
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100580 movl %ebp, %edi # c
581 xorl %eax, %edi # ^d
582 xorl %edx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100583 addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100584 addl %edi, %ebx # e += (c ^ d ^ b)
585 movl %ecx, %esi #
586 roll $5, %esi # rotl32(a,5)
587 addl %esi, %ebx # e += rotl32(a,5)
588 rorl $2, %edx # b = rotl32(b,30)
589# 29
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100590 movl %edx, %edi # c
591 xorl %ebp, %edi # ^d
592 xorl %ecx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100593 addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100594 addl %edi, %eax # e += (c ^ d ^ b)
595 movl %ebx, %esi #
596 roll $5, %esi # rotl32(a,5)
597 addl %esi, %eax # e += rotl32(a,5)
598 rorl $2, %ecx # b = rotl32(b,30)
599# 30
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100600 movl %ecx, %edi # c
601 xorl %edx, %edi # ^d
602 xorl %ebx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100603 addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100604 addl %edi, %ebp # e += (c ^ d ^ b)
605 movl %eax, %esi #
606 roll $5, %esi # rotl32(a,5)
607 addl %esi, %ebp # e += rotl32(a,5)
608 rorl $2, %ebx # b = rotl32(b,30)
609# 31
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100610 movl %ebx, %edi # c
611 xorl %ecx, %edi # ^d
612 xorl %eax, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100613 addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100614 addl %edi, %edx # e += (c ^ d ^ b)
615 movl %ebp, %esi #
616 roll $5, %esi # rotl32(a,5)
617 addl %esi, %edx # e += rotl32(a,5)
618 rorl $2, %eax # b = rotl32(b,30)
Denys Vlasenko4923f742022-02-08 03:29:16 +0100619 pshufd $0xaa, %xmm7, %xmm6
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100620# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
621 movaps %xmm1, %xmm4
622 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100623# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
624# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
625# same result as above, but shorter and faster:
626# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
627# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
628 movaps %xmm2, %xmm5
629 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100630 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
631 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
632 xorps %xmm5, %xmm2 # ^
633 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
634 movaps %xmm2, %xmm5
635 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100636 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
637 paddd %xmm2, %xmm2 # shift left by 1
638 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100639 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
640 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
641 movaps %xmm5, %xmm4
642 pslld $2, %xmm5
643 psrld $30, %xmm4
644# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
645 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
646 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
647 movaps %xmm2, %xmm5
648 paddd %xmm6, %xmm5
649 movups %xmm5, -64+16*2(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100650# 32
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100651 movl %eax, %edi # c
652 xorl %ebx, %edi # ^d
653 xorl %ebp, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100654 addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100655 addl %edi, %ecx # e += (c ^ d ^ b)
656 movl %edx, %esi #
657 roll $5, %esi # rotl32(a,5)
658 addl %esi, %ecx # e += rotl32(a,5)
659 rorl $2, %ebp # b = rotl32(b,30)
660# 33
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100661 movl %ebp, %edi # c
662 xorl %eax, %edi # ^d
663 xorl %edx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100664 addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100665 addl %edi, %ebx # e += (c ^ d ^ b)
666 movl %ecx, %esi #
667 roll $5, %esi # rotl32(a,5)
668 addl %esi, %ebx # e += rotl32(a,5)
669 rorl $2, %edx # b = rotl32(b,30)
670# 34
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100671 movl %edx, %edi # c
672 xorl %ebp, %edi # ^d
673 xorl %ecx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100674 addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100675 addl %edi, %eax # e += (c ^ d ^ b)
676 movl %ebx, %esi #
677 roll $5, %esi # rotl32(a,5)
678 addl %esi, %eax # e += rotl32(a,5)
679 rorl $2, %ecx # b = rotl32(b,30)
680# 35
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100681 movl %ecx, %edi # c
682 xorl %edx, %edi # ^d
683 xorl %ebx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100684 addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100685 addl %edi, %ebp # e += (c ^ d ^ b)
686 movl %eax, %esi #
687 roll $5, %esi # rotl32(a,5)
688 addl %esi, %ebp # e += rotl32(a,5)
689 rorl $2, %ebx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100690# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
691 movaps %xmm2, %xmm4
692 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100693# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
694# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
695# same result as above, but shorter and faster:
696# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
697# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
698 movaps %xmm3, %xmm5
699 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100700 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
701 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
702 xorps %xmm5, %xmm3 # ^
703 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
704 movaps %xmm3, %xmm5
705 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100706 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
707 paddd %xmm3, %xmm3 # shift left by 1
708 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100709 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
710 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
711 movaps %xmm5, %xmm4
712 pslld $2, %xmm5
713 psrld $30, %xmm4
714# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
715 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
716 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
717 movaps %xmm3, %xmm5
718 paddd %xmm6, %xmm5
719 movups %xmm5, -64+16*3(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100720# 36
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100721 movl %ebx, %edi # c
722 xorl %ecx, %edi # ^d
723 xorl %eax, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100724 addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100725 addl %edi, %edx # e += (c ^ d ^ b)
726 movl %ebp, %esi #
727 roll $5, %esi # rotl32(a,5)
728 addl %esi, %edx # e += rotl32(a,5)
729 rorl $2, %eax # b = rotl32(b,30)
730# 37
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100731 movl %eax, %edi # c
732 xorl %ebx, %edi # ^d
733 xorl %ebp, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100734 addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100735 addl %edi, %ecx # e += (c ^ d ^ b)
736 movl %edx, %esi #
737 roll $5, %esi # rotl32(a,5)
738 addl %esi, %ecx # e += rotl32(a,5)
739 rorl $2, %ebp # b = rotl32(b,30)
740# 38
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100741 movl %ebp, %edi # c
742 xorl %eax, %edi # ^d
743 xorl %edx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100744 addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100745 addl %edi, %ebx # e += (c ^ d ^ b)
746 movl %ecx, %esi #
747 roll $5, %esi # rotl32(a,5)
748 addl %esi, %ebx # e += rotl32(a,5)
749 rorl $2, %edx # b = rotl32(b,30)
750# 39
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100751 movl %edx, %edi # c
752 xorl %ebp, %edi # ^d
753 xorl %ecx, %edi # ^b
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100754 addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100755 addl %edi, %eax # e += (c ^ d ^ b)
756 movl %ebx, %esi #
757 roll $5, %esi # rotl32(a,5)
758 addl %esi, %eax # e += rotl32(a,5)
759 rorl $2, %ecx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100760# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
761 movaps %xmm3, %xmm4
762 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100763# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
764# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
765# same result as above, but shorter and faster:
766# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
767# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
768 movaps %xmm0, %xmm5
769 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100770 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
771 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
772 xorps %xmm5, %xmm0 # ^
773 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
774 movaps %xmm0, %xmm5
775 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100776 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
777 paddd %xmm0, %xmm0 # shift left by 1
778 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100779 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
780 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
781 movaps %xmm5, %xmm4
782 pslld $2, %xmm5
783 psrld $30, %xmm4
784# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
785 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
786 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
787 movaps %xmm0, %xmm5
788 paddd %xmm6, %xmm5
789 movups %xmm5, -64+16*0(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100790# 40
791 movl %ebx, %edi # di: b
792 movl %ebx, %esi # si: b
793 orl %ecx, %edi # di: b | c
794 andl %ecx, %esi # si: b & c
795 andl %edx, %edi # di: (b | c) & d
796 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100797 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100798 addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100799 movl %eax, %esi #
800 roll $5, %esi # rotl32(a,5)
801 addl %esi, %ebp # e += rotl32(a,5)
802 rorl $2, %ebx # b = rotl32(b,30)
803# 41
804 movl %eax, %edi # di: b
805 movl %eax, %esi # si: b
806 orl %ebx, %edi # di: b | c
807 andl %ebx, %esi # si: b & c
808 andl %ecx, %edi # di: (b | c) & d
809 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100810 addl %edi, %edx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100811 addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100812 movl %ebp, %esi #
813 roll $5, %esi # rotl32(a,5)
814 addl %esi, %edx # e += rotl32(a,5)
815 rorl $2, %eax # b = rotl32(b,30)
816# 42
817 movl %ebp, %edi # di: b
818 movl %ebp, %esi # si: b
819 orl %eax, %edi # di: b | c
820 andl %eax, %esi # si: b & c
821 andl %ebx, %edi # di: (b | c) & d
822 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100823 addl %edi, %ecx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100824 addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100825 movl %edx, %esi #
826 roll $5, %esi # rotl32(a,5)
827 addl %esi, %ecx # e += rotl32(a,5)
828 rorl $2, %ebp # b = rotl32(b,30)
829# 43
830 movl %edx, %edi # di: b
831 movl %edx, %esi # si: b
832 orl %ebp, %edi # di: b | c
833 andl %ebp, %esi # si: b & c
834 andl %eax, %edi # di: (b | c) & d
835 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100836 addl %edi, %ebx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100837 addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100838 movl %ecx, %esi #
839 roll $5, %esi # rotl32(a,5)
840 addl %esi, %ebx # e += rotl32(a,5)
841 rorl $2, %edx # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100842# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
843 movaps %xmm0, %xmm4
844 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100845# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
846# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
847# same result as above, but shorter and faster:
848# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
849# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
850 movaps %xmm1, %xmm5
851 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100852 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
853 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
854 xorps %xmm5, %xmm1 # ^
855 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
856 movaps %xmm1, %xmm5
857 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100858 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
859 paddd %xmm1, %xmm1 # shift left by 1
860 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100861 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
862 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
863 movaps %xmm5, %xmm4
864 pslld $2, %xmm5
865 psrld $30, %xmm4
866# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
867 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
868 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
869 movaps %xmm1, %xmm5
870 paddd %xmm6, %xmm5
871 movups %xmm5, -64+16*1(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100872# 44
873 movl %ecx, %edi # di: b
874 movl %ecx, %esi # si: b
875 orl %edx, %edi # di: b | c
876 andl %edx, %esi # si: b & c
877 andl %ebp, %edi # di: (b | c) & d
878 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100879 addl %edi, %eax # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100880 addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100881 movl %ebx, %esi #
882 roll $5, %esi # rotl32(a,5)
883 addl %esi, %eax # e += rotl32(a,5)
884 rorl $2, %ecx # b = rotl32(b,30)
885# 45
886 movl %ebx, %edi # di: b
887 movl %ebx, %esi # si: b
888 orl %ecx, %edi # di: b | c
889 andl %ecx, %esi # si: b & c
890 andl %edx, %edi # di: (b | c) & d
891 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100892 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100893 addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100894 movl %eax, %esi #
895 roll $5, %esi # rotl32(a,5)
896 addl %esi, %ebp # e += rotl32(a,5)
897 rorl $2, %ebx # b = rotl32(b,30)
898# 46
899 movl %eax, %edi # di: b
900 movl %eax, %esi # si: b
901 orl %ebx, %edi # di: b | c
902 andl %ebx, %esi # si: b & c
903 andl %ecx, %edi # di: (b | c) & d
904 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100905 addl %edi, %edx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100906 addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100907 movl %ebp, %esi #
908 roll $5, %esi # rotl32(a,5)
909 addl %esi, %edx # e += rotl32(a,5)
910 rorl $2, %eax # b = rotl32(b,30)
911# 47
912 movl %ebp, %edi # di: b
913 movl %ebp, %esi # si: b
914 orl %eax, %edi # di: b | c
915 andl %eax, %esi # si: b & c
916 andl %ebx, %edi # di: (b | c) & d
917 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100918 addl %edi, %ecx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100919 addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100920 movl %edx, %esi #
921 roll $5, %esi # rotl32(a,5)
922 addl %esi, %ecx # e += rotl32(a,5)
923 rorl $2, %ebp # b = rotl32(b,30)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100924# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
925 movaps %xmm1, %xmm4
926 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +0100927# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
928# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
929# same result as above, but shorter and faster:
930# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
931# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
932 movaps %xmm2, %xmm5
933 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100934 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
935 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
936 xorps %xmm5, %xmm2 # ^
937 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
938 movaps %xmm2, %xmm5
939 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +0100940 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
941 paddd %xmm2, %xmm2 # shift left by 1
942 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100943 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
944 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
945 movaps %xmm5, %xmm4
946 pslld $2, %xmm5
947 psrld $30, %xmm4
948# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
949 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
950 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
951 movaps %xmm2, %xmm5
952 paddd %xmm6, %xmm5
953 movups %xmm5, -64+16*2(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100954# 48
955 movl %edx, %edi # di: b
956 movl %edx, %esi # si: b
957 orl %ebp, %edi # di: b | c
958 andl %ebp, %esi # si: b & c
959 andl %eax, %edi # di: (b | c) & d
960 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100961 addl %edi, %ebx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100962 addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100963 movl %ecx, %esi #
964 roll $5, %esi # rotl32(a,5)
965 addl %esi, %ebx # e += rotl32(a,5)
966 rorl $2, %edx # b = rotl32(b,30)
967# 49
968 movl %ecx, %edi # di: b
969 movl %ecx, %esi # si: b
970 orl %edx, %edi # di: b | c
971 andl %edx, %esi # si: b & c
972 andl %ebp, %edi # di: (b | c) & d
973 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100974 addl %edi, %eax # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100975 addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100976 movl %ebx, %esi #
977 roll $5, %esi # rotl32(a,5)
978 addl %esi, %eax # e += rotl32(a,5)
979 rorl $2, %ecx # b = rotl32(b,30)
980# 50
981 movl %ebx, %edi # di: b
982 movl %ebx, %esi # si: b
983 orl %ecx, %edi # di: b | c
984 andl %ecx, %esi # si: b & c
985 andl %edx, %edi # di: (b | c) & d
986 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100987 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +0100988 addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100989 movl %eax, %esi #
990 roll $5, %esi # rotl32(a,5)
991 addl %esi, %ebp # e += rotl32(a,5)
992 rorl $2, %ebx # b = rotl32(b,30)
993# 51
994 movl %eax, %edi # di: b
995 movl %eax, %esi # si: b
996 orl %ebx, %edi # di: b | c
997 andl %ebx, %esi # si: b & c
998 andl %ecx, %edi # di: (b | c) & d
999 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001000 addl %edi, %edx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001001 addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001002 movl %ebp, %esi #
1003 roll $5, %esi # rotl32(a,5)
1004 addl %esi, %edx # e += rotl32(a,5)
1005 rorl $2, %eax # b = rotl32(b,30)
Denys Vlasenko4923f742022-02-08 03:29:16 +01001006 pshufd $0xff, %xmm7, %xmm6
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001007# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1008 movaps %xmm2, %xmm4
1009 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
Denys Vlasenkoc193cbd2022-02-07 02:06:18 +01001010# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1011# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1012# same result as above, but shorter and faster:
1013# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1014# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1015 movaps %xmm3, %xmm5
1016 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001017 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1018 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1019 xorps %xmm5, %xmm3 # ^
1020 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1021 movaps %xmm3, %xmm5
1022 xorps %xmm4, %xmm4 # rol(W0,1):
Denys Vlasenko205042c2022-01-25 17:00:57 +01001023 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1024 paddd %xmm3, %xmm3 # shift left by 1
1025 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001026 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1027 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1028 movaps %xmm5, %xmm4
1029 pslld $2, %xmm5
1030 psrld $30, %xmm4
1031# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1032 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
1033 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1034 movaps %xmm3, %xmm5
1035 paddd %xmm6, %xmm5
1036 movups %xmm5, -64+16*3(%rsp)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001037# 52
1038 movl %ebp, %edi # di: b
1039 movl %ebp, %esi # si: b
1040 orl %eax, %edi # di: b | c
1041 andl %eax, %esi # si: b & c
1042 andl %ebx, %edi # di: (b | c) & d
1043 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001044 addl %edi, %ecx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001045 addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001046 movl %edx, %esi #
1047 roll $5, %esi # rotl32(a,5)
1048 addl %esi, %ecx # e += rotl32(a,5)
1049 rorl $2, %ebp # b = rotl32(b,30)
1050# 53
1051 movl %edx, %edi # di: b
1052 movl %edx, %esi # si: b
1053 orl %ebp, %edi # di: b | c
1054 andl %ebp, %esi # si: b & c
1055 andl %eax, %edi # di: (b | c) & d
1056 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001057 addl %edi, %ebx # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001058 addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001059 movl %ecx, %esi #
1060 roll $5, %esi # rotl32(a,5)
1061 addl %esi, %ebx # e += rotl32(a,5)
1062 rorl $2, %edx # b = rotl32(b,30)
1063# 54
1064 movl %ecx, %edi # di: b
1065 movl %ecx, %esi # si: b
1066 orl %edx, %edi # di: b | c
1067 andl %edx, %esi # si: b & c
1068 andl %ebp, %edi # di: (b | c) & d
1069 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001070 addl %edi, %eax # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001071 addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001072 movl %ebx, %esi #
1073 roll $5, %esi # rotl32(a,5)
1074 addl %esi, %eax # e += rotl32(a,5)
1075 rorl $2, %ecx # b = rotl32(b,30)
1076# 55
1077 movl %ebx, %edi # di: b
1078 movl %ebx, %esi # si: b
1079 orl %ecx, %edi # di: b | c
1080 andl %ecx, %esi # si: b & c
1081 andl %edx, %edi # di: (b | c) & d
1082 orl %esi, %edi # ((b | c) & d) | (b & c)
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001083 addl %edi, %ebp # += ((b | c) & d) | (b & c)
Denys Vlasenko39369ff2022-01-23 09:27:30 +01001084 addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
Denys Vlasenko05fd13e2022-01-03 01:57:29 +01001085 movl %eax, %esi #
1086 roll $5, %esi # rotl32(a,5)
1087 addl %esi, %ebp # e += rotl32(a,5)
1088 rorl $2, %ebx # b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	# ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm0, %xmm0	# shift left by 1
	psubd	%xmm4, %xmm0	# add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*0(%rsp)
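# The PREP block above computes the next four message-schedule words,
# W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), in one vector.
# Roughly equivalent scalar C (illustrative sketch only, not part of the
# generated code; here W[] is a plain 16-word circular buffer, while the
# code keeps W[] in %xmm0..%xmm3 and spills W + RCONST to -64(%rsp)):
#	for (i = 0; i < 4; i++) {
#		uint32_t x = W[(t+i-16) & 15] ^ W[(t+i-14) & 15]
#			   ^ W[(t+i-8) & 15] ^ W[(t+i-3) & 15];
#		W[(t+i) & 15] = rotl32(x, 1);
#	}
# Two tricks are used above:
# - rol by 1 without a vector rotate: pcmpgtd against zero builds a mask of
#   lanes whose msb is set, paddd doubles every lane, and psubd adds 1 back
#   to exactly those lanes, i.e. (x << 1) | (x >> 31).
# - the last lane's W[t-3] input is produced by this same vector, so it is
#   first computed with 0 in its place and fixed up afterwards by XORing in
#   rol(unrotW[0],2), which works because rol(x ^ y, 1) == rol(x,1) ^ rol(y,1).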
# 56
	movl	%eax, %edi	# di: b
	movl	%eax, %esi	# si: b
	orl	%ebx, %edi	# di: b | c
	andl	%ebx, %esi	# si: b & c
	andl	%ecx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %edx	# += ((b | c) & d) | (b & c)
	addl	-64+4*8(%rsp), %edx	# e += RCONST + W[n & 15]
	movl	%ebp, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
# 57
	movl	%ebp, %edi	# di: b
	movl	%ebp, %esi	# si: b
	orl	%eax, %edi	# di: b | c
	andl	%eax, %esi	# si: b & c
	andl	%ebx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ecx	# += ((b | c) & d) | (b & c)
	addl	-64+4*9(%rsp), %ecx	# e += RCONST + W[n & 15]
	movl	%edx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# 58
	movl	%edx, %edi	# di: b
	movl	%edx, %esi	# si: b
	orl	%ebp, %edi	# di: b | c
	andl	%ebp, %esi	# si: b & c
	andl	%eax, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebx	# += ((b | c) & d) | (b & c)
	addl	-64+4*10(%rsp), %ebx	# e += RCONST + W[n & 15]
	movl	%ecx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# 59
	movl	%ecx, %edi	# di: b
	movl	%ecx, %esi	# si: b
	orl	%edx, %edi	# di: b | c
	andl	%edx, %esi	# si: b & c
	andl	%ebp, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %eax	# += ((b | c) & d) | (b & c)
	addl	-64+4*11(%rsp), %eax	# e += RCONST + W[n & 15]
	movl	%ebx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	# ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm1, %xmm1	# shift left by 1
	psubd	%xmm4, %xmm1	# add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*1(%rsp)
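# From round 60 on (as in rounds 20..39) the round function is the parity
# function b ^ c ^ d.  One round, roughly, in C (illustrative sketch only,
# not part of the generated code; a..e are the rotating register names,
# and the dword loaded from -64+4*(n & 15)(%rsp) already contains
# RCONST + W[n & 15]):
#	e += (b ^ c ^ d) + (RCONST + W[n & 15]) + rotl32(a, 5);
#	b = rotl32(b, 30);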
# 60
	movl	%ecx, %edi	# c
	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*12(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	movl	%eax, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# 61
	movl	%ebx, %edi	# c
	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*13(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	movl	%ebp, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
# 62
	movl	%eax, %edi	# c
	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*14(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	movl	%edx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# 63
	movl	%ebp, %edi	# c
	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*15(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	movl	%ecx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
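# The 16 dwords at -64(%rsp) act as a circular W[] buffer: round n reads
# slot n & 15, and each PREP block refills four slots (with RCONST already
# added in) several rounds before they are consumed again, which lets the
# vector schedule computation overlap with the scalar rounds.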
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	# ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm2, %xmm2	# shift left by 1
	psubd	%xmm4, %xmm2	# add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*2(%rsp)
# 64
	movl	%edx, %edi	# c
	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*0(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	movl	%ebx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# 65
	movl	%ecx, %edi	# c
	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*1(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	movl	%eax, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# 66
	movl	%ebx, %edi	# c
	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*2(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	movl	%ebp, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
# 67
	movl	%eax, %edi	# c
	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*3(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	movl	%edx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	# ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm3, %xmm3	# shift left by 1
	psubd	%xmm4, %xmm3	# add 1 to those who had msb bit 1
	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5
	movups	%xmm5, -64+16*3(%rsp)
# 68
	movl	%ebp, %edi	# c
	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*4(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	movl	%ecx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# 69
	movl	%edx, %edi	# c
	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*5(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	movl	%ebx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# 70
	movl	%ecx, %edi	# c
	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*6(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	movl	%eax, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# 71
	movl	%ebx, %edi	# c
	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*7(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	movl	%ebp, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
# 72
	movl	%eax, %edi	# c
	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*8(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	movl	%edx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# 73
	movl	%ebp, %edi	# c
	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*9(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	movl	%ecx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# 74
	movl	%edx, %edi	# c
	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*10(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	movl	%ebx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# 75
	movl	%ecx, %edi	# c
	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*11(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	movl	%eax, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# 76
	movl	%ebx, %edi	# c
	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*12(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	movl	%ebp, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
# 77
	movl	%eax, %edi	# c
	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*13(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	movl	%edx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# 78
	movl	%ebp, %edi	# c
	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*14(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	movl	%ecx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# 79
	movl	%edx, %edi	# c
	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*15(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	movl	%ebx, %esi	#
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
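# Callee-saved registers are restored below, interleaved with the
# ctx->hash[] updates (presumably so the pops and the memory adds can
# overlap).  %rdi, pushed last in the prologue to preserve the ctx
# pointer, is therefore popped first.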

	popq	%rdi	#
	popq	%r12	#
	addl	%eax, 80(%rdi)	# ctx->hash[0] += a
	popq	%r13	#
	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
	popq	%r14	#
	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
#	popq	%r15	#
	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
	popq	%rbx	#
	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e
	popq	%rbp	#

	ret
	.size	sha1_process_block64, .-sha1_process_block64

	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
	.balign	16
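# The four SHA-1 round constants, one per 20-round group
# (rounds 0..19, 20..39, 40..59, 60..79); the PREP blocks add the
# current one (broadcast in %xmm6) to W[] before spilling to the stack.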
sha1const:
	.long	0x5A827999
	.long	0x6ED9EBA1
	.long	0x8F1BBCDC
	.long	0xCA62C1D6

#endif