#!/bin/sh

# Generator for hash_md5_sha_x86-64.S: everything this script prints
# ends up in that file (see the "exec" redirection below).
#
# We don't regenerate it on every "make" invocation - only by hand.
# The reason is that the changes to generated code are difficult
# to visualize by looking only at this script, it helps when the commit
# also contains the diff of the generated file.

# Redirect all further stdout into the generated file.  Some shells keep
# running after a failed "exec" redirection, so bail out explicitly rather
# than dumping the assembly onto the terminal.
exec >hash_md5_sha_x86-64.S || exit 1

# There is a way to use XMM registers (which always exist for x86-64!) for W[].
# For example, if we load W as follows:
#	%xmm0:  w[0x0] w[0x1] w[0x2] w[0x3]
#	%xmm4:  w[0x4] w[0x5] w[0x6] w[0x7]
#	%xmm8:  w[0x8] w[0x9] w[0xa] w[0xb]
#	%xmm12: w[0xc] w[0xd] w[0xe] w[0xf]
# then the xor'ing operation to generate next W[0..3] is:
#	movaps	%xmm0, %xmmT2
#	palignr	$0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5])
#	# Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn.
#	movaps	%xmm0, %xmmT13
#	palignr	$0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0])
#	xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13
#	xmm0 = rol32(xmm0,1)	# no such insn, have to use pslld+psrld+or
# and then results can be extracted for use:
#	movd	%xmm0, %esi	# new W[0]
#	pextrd	$1, %xmm0, %esi	# new W[1]
#	# SSE4.1 insn. Can use EXTRACTPS (also SSE4.1)
#	pextrd	$2, %xmm0, %esi	# new W[2]
#	pextrd	$3, %xmm0, %esi	# new W[3]
# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64.

# Emit the fixed head of the generated file: banner, the preprocessor guard,
# section/symbol directives and the prologue of sha1_process_block64.
# The prologue saves the registers that the epilogue later pops, stores
# byte-swapped W[0..7] at -32(%rsp)..-4(%rsp), loads a..e from offsets
# 80..96 of the ctx pointed to by %rdi, and loads byte-swapped W[8..15]
# into %r8d..%r15d.
# The text is single-quoted, so "$3", "$32" etc. reach the assembler as-is.
echo \
'### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
	.section	.text.sha1_process_block64,"ax",@progbits
	.globl	sha1_process_block64
	.hidden	sha1_process_block64
	.type	sha1_process_block64, @function

	.balign	8	# allow decoders to fetch at least 5 first insns
sha1_process_block64:
	pushq	%rbp	# 1 byte insn
	pushq	%rbx	# 1 byte insn
	pushq	%r15	# 2 byte insn
	pushq	%r14	# 2 byte insn
	pushq	%r13	# 2 byte insn
	pushq	%r12	# 2 byte insn
	pushq	%rdi	# we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi: temps
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
	movl	$3, %eax
1:
	movq	(%rdi,%rax,8), %rsi
	bswapq	%rsi
	rolq	$32, %rsi
	movq	%rsi, -32(%rsp,%rax,8)
	decl	%eax
	jns	1b

	movl	80(%rdi), %eax		# a = ctx->hash[0]
	movl	84(%rdi), %ebx		# b = ctx->hash[1]
	movl	88(%rdi), %ecx		# c = ctx->hash[2]
	movl	92(%rdi), %edx		# d = ctx->hash[3]
	movl	96(%rdi), %ebp		# e = ctx->hash[4]

	movq	4*8(%rdi), %r8
	movq	4*10(%rdi), %r10
	bswapq	%r8
	bswapq	%r10
	movq	4*12(%rdi), %r12
	movq	4*14(%rdi), %r14
	bswapq	%r12
	bswapq	%r14
	movl	%r8d, %r9d
	shrq	$32, %r8
	movl	%r10d, %r11d
	shrq	$32, %r10
	movl	%r12d, %r13d
	shrq	$32, %r12
	movl	%r14d, %r15d
	shrq	$32, %r14
'
# W32 n
# Print the assembler operand that holds the 32-bit schedule word W[n]:
#   n in 0..7  -> stack slot  "-32+4*n(%rsp)"
#   n in 8..15 -> register    "%r<n>d"
# Returns 0 on success.
# If n is missing, not a number, or outside 0..15, prints a diagnostic on
# stderr and exits with status 1.  Callers invoke W32 inside command
# substitution, so that exit ends only the substitution's subshell; the
# diagnostic is what makes the failure visible.
W32() {
	# A failing/erroring numeric test (empty or non-numeric $1) is
	# treated the same as an out-of-range index.
	if ! test "$1" -ge 0 2>/dev/null || test "$1" -gt 15; then
		echo "W32: bad W[] index '$1' (expected 0..15)" >&2
		exit 1
	fi
	if test "$1" -lt 8; then
		echo "-32+4*$1(%rsp)"
	else
		echo "%r${1}d"
	fi
}

# It's possible to interleave insns in rounds to mostly eliminate
# dependency chains, but this is likely to only help old Pentium-based
# CPUs (ones without OOO, which can only simultaneously execute a pair
# of _adjacent_ insns).
# Testing on old-ish Silvermont CPU (which has OOO window of only
# about ~8 insns) shows very small (~1%) speedup.

# RD1A a b c d e n
# Emit the assembly for one round that uses W[n] as loaded (no schedule
# update):
#   e += RCONST + W[n] + (((c ^ d) & b) ^ d) + rotl32(a,5); b = rotl32(b,30)
# Arguments:
#   a..e - register name suffixes ("ax", "bp", ...); "%e"/"%r" is prepended
#   n    - round number (evaluated as a shell arithmetic expression)
# Globals: RCONST (read) - round constant, used as the leal displacement.
# Calls W32 for the stack operand when (n & 15) is in 1..7.
# Output: assembly text on stdout, without empty lines.
RD1A() {
	local a="$1" b="$2" c="$3" d="$4" e="$5"
	local n=$(($6))
	local n0=$(((n+0) & 15))
	local w	# 64-bit register whose low half holds W[n] for the leal

	echo "# $n"
	if test "$n0" -eq 0; then
		# The generated code expects W[0] to be in %esi already.
		echo "	# W[0], already in %esi"
		w=rsi
	elif test "$n0" -lt 8; then
		echo "	movl	$(W32 "$n0"), %esi	# W[n]"
		w=rsi
	else
		echo "	# W[n], in %r$n0"
		w=r$n0
	fi
	echo "	movl	%e$c, %edi		# c
	xorl	%e$d, %edi		# ^d
	andl	%e$b, %edi		# &b
	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
	leal	$RCONST(%r$e,%$w), %e$e	# e += RCONST + W[n]
	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)"
}
# RD1B a b c d e n
# Emit the assembly for one round that first updates the schedule word
#   W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1)
# and then performs
#   e += RCONST + W[n & 15] + (((c ^ d) & b) ^ d) + rotl32(a,5); b = rotl32(b,30)
# Arguments:
#   a..e - register name suffixes ("ax", "bp", ...); "%e"/"%r" is prepended
#   n    - round number (evaluated as a shell arithmetic expression)
# Globals: RCONST (read) - round constant, used as the leal displacement.
# Calls W32 to obtain the operand for each W[] word.
# Output: assembly text on stdout, without empty lines.
RD1B() {
	local a="$1" b="$2" c="$3" d="$4" e="$5"
	local n=$(($6))
	local n13=$(((n+13) & 15))
	local n8=$(((n+8) & 15))
	local n2=$(((n+2) & 15))
	local n0=$(((n+0) & 15))
	local w	# 64-bit register whose low half holds the new W[n & 15]

	echo "# $n"
	if test "$n0" -lt 8; then
		# W[n & 15] lives on the stack: compute in %esi, store back.
		echo "	movl	$(W32 "$n13"), %esi	# W[(n+13) & 15]
	xorl	$(W32 "$n8"), %esi	# ^W[(n+8) & 15]
	xorl	$(W32 "$n2"), %esi	# ^W[(n+2) & 15]
	xorl	$(W32 "$n0"), %esi	# ^W[n & 15]
	roll	%esi			#
	movl	%esi, $(W32 "$n0")	# store to W[n & 15]"
		w=rsi
	else
		# W[n & 15] lives in a register: update it in place.
		echo "	xorl	$(W32 "$n13"), $(W32 "$n0")	# W[n & 15] ^= W[(n+13) & 15]
	xorl	$(W32 "$n8"), $(W32 "$n0")	# ^W[(n+8) & 15]
	xorl	$(W32 "$n2"), $(W32 "$n0")	# ^W[(n+2) & 15]
	roll	$(W32 "$n0")		#"
		w=r$n0
	fi
	echo "	movl	%e$c, %edi		# c
	xorl	%e$d, %edi		# ^d
	andl	%e$b, %edi		# &b
	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
	leal	$RCONST(%r$e,%$w), %e$e	# e += RCONST + W[n & 15]
	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)"
}
# Rounds 0..19.  Rounds 0..15 consume W[] as loaded (RD1A); rounds 16..19
# start updating the schedule (RD1B).  The a..e register roles rotate by one
# position every round.  RCONST is read by the round generators when they
# expand their "leal" line; grep drops any empty lines from their output.
{
RCONST=0x5A827999
RD1A ax bx cx dx bp  0; RD1A bp ax bx cx dx  1; RD1A dx bp ax bx cx  2; RD1A cx dx bp ax bx  3; RD1A bx cx dx bp ax  4
RD1A ax bx cx dx bp  5; RD1A bp ax bx cx dx  6; RD1A dx bp ax bx cx  7; RD1A cx dx bp ax bx  8; RD1A bx cx dx bp ax  9
RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14
RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19
} | grep -v '^$'

# RD2 a b c d e n
# Emit the assembly for one round that first updates the schedule word
#   W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1)
# and then performs
#   e += RCONST + W[n & 15] + (c ^ d ^ b) + rotl32(a,5); b = rotl32(b,30)
# Arguments:
#   a..e - register name suffixes ("ax", "bp", ...); "%e"/"%r" is prepended
#   n    - round number (evaluated as a shell arithmetic expression)
# Globals: RCONST (read) - round constant, used as the leal displacement.
# Calls W32 to obtain the operand for each W[] word.
# Output: assembly text on stdout, without empty lines.
RD2() {
	local a="$1" b="$2" c="$3" d="$4" e="$5"
	local n=$(($6))
	local n13=$(((n+13) & 15))
	local n8=$(((n+8) & 15))
	local n2=$(((n+2) & 15))
	local n0=$(((n+0) & 15))
	local w	# 64-bit register whose low half holds the new W[n & 15]

	echo "# $n"
	if test "$n0" -lt 8; then
		# W[n & 15] lives on the stack: compute in %esi, store back.
		echo "	movl	$(W32 "$n13"), %esi	# W[(n+13) & 15]
	xorl	$(W32 "$n8"), %esi	# ^W[(n+8) & 15]
	xorl	$(W32 "$n2"), %esi	# ^W[(n+2) & 15]
	xorl	$(W32 "$n0"), %esi	# ^W[n & 15]
	roll	%esi			#
	movl	%esi, $(W32 "$n0")	# store to W[n & 15]"
		w=rsi
	else
		# W[n & 15] lives in a register: update it in place.
		echo "	xorl	$(W32 "$n13"), $(W32 "$n0")	# W[n & 15] ^= W[(n+13) & 15]
	xorl	$(W32 "$n8"), $(W32 "$n0")	# ^W[(n+8) & 15]
	xorl	$(W32 "$n2"), $(W32 "$n0")	# ^W[(n+2) & 15]
	roll	$(W32 "$n0")		#"
		w=r$n0
	fi
	echo "	movl	%e$c, %edi		# c
	xorl	%e$d, %edi		# ^d
	xorl	%e$b, %edi		# ^b
	leal	$RCONST(%r$e,%$w), %e$e	# e += RCONST + W[n & 15]
	addl	%edi, %e$e		# e += (c ^ d ^ b)
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)"
}
# Rounds 20..39, all generated by RD2.  The a..e register roles rotate by
# one position every round.  RCONST is read by RD2 when it expands its
# "leal" line; grep drops any empty lines from the generators' output.
{
RCONST=0x6ED9EBA1
RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24
RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29
RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34
RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39
} | grep -v '^$'

# RD3 a b c d e n
# Emit the assembly for one round that computes
#   f = ((b | c) & d) | (b & c)
# into %edi (using %esi as scratch), then updates the schedule word
#   W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1)
# and finally performs
#   e += f + RCONST + W[n & 15] + rotl32(a,5); b = rotl32(b,30)
# f is computed first because the stack variant of the W[] update needs
# %esi for itself afterwards.
# Arguments:
#   a..e - register name suffixes ("ax", "bp", ...); "%e"/"%r" is prepended
#   n    - round number (evaluated as a shell arithmetic expression)
# Globals: RCONST (read) - round constant, used as the leal displacement.
# Calls W32 to obtain the operand for each W[] word.
# Output: assembly text on stdout, without empty lines.
RD3() {
	local a="$1" b="$2" c="$3" d="$4" e="$5"
	local n=$(($6))
	local n13=$(((n+13) & 15))
	local n8=$(((n+8) & 15))
	local n2=$(((n+2) & 15))
	local n0=$(((n+0) & 15))
	local w	# 64-bit register whose low half holds the new W[n & 15]

	echo "# $n
	movl	%e$b, %edi		# di: b
	movl	%e$b, %esi		# si: b
	orl	%e$c, %edi		# di: b | c
	andl	%e$c, %esi		# si: b & c
	andl	%e$d, %edi		# di: (b | c) & d
	orl	%esi, %edi		# ((b | c) & d) | (b & c)"
	if test "$n0" -lt 8; then
		# W[n & 15] lives on the stack: compute in %esi, store back.
		echo "	movl	$(W32 "$n13"), %esi	# W[(n+13) & 15]
	xorl	$(W32 "$n8"), %esi	# ^W[(n+8) & 15]
	xorl	$(W32 "$n2"), %esi	# ^W[(n+2) & 15]
	xorl	$(W32 "$n0"), %esi	# ^W[n & 15]
	roll	%esi			#
	movl	%esi, $(W32 "$n0")	# store to W[n & 15]"
		w=rsi
	else
		# W[n & 15] lives in a register: update it in place.
		echo "	xorl	$(W32 "$n13"), $(W32 "$n0")	# W[n & 15] ^= W[(n+13) & 15]
	xorl	$(W32 "$n8"), $(W32 "$n0")	# ^W[(n+8) & 15]
	xorl	$(W32 "$n2"), $(W32 "$n0")	# ^W[(n+2) & 15]
	roll	$(W32 "$n0")		#"
		w=r$n0
	fi
	echo "	addl	%edi, %e$e		# += ((b | c) & d) | (b & c)
	leal	$RCONST(%r$e,%$w), %e$e	# e += RCONST + W[n & 15]
	movl	%e$a, %esi		#
	roll	\$5, %esi		# rotl32(a,5)
	addl	%esi, %e$e		# e += rotl32(a,5)
	rorl	\$2, %e$b		# b = rotl32(b,30)"
}
# Rounds 40..59, all generated by RD3.  The a..e register roles rotate by
# one position every round.  RCONST is read by RD3 when it expands its
# "leal" line; grep drops any empty lines from the generators' output.
{
# The constant is 0x8F1BBCDC, but the assembler rejects that as
# "out of range for signed 32bit displacement", so the equivalent
# negative value (0x8F1BBCDC - 0x100000000) is used instead.
#RCONST=0x8F1BBCDC
RCONST=-0x70E44324
RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44
RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49
RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54
RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59
} | grep -v '^$'

# Round 4 has the same logic as round 2, only n and RCONST are different.
# The a..e register roles rotate by one position every round.  RCONST is
# read by RD2 when it expands its "leal" line; grep drops any empty lines
# from the generators' output.
{
# The constant is 0xCA62C1D6, but the assembler rejects that as
# "out of range for signed 32bit displacement", so the equivalent
# negative value (0xCA62C1D6 - 0x100000000) is used instead.
#RCONST=0xCA62C1D6
RCONST=-0x359D3E2A
RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64
RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69
RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74
RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79
# Note: new W[n&15] values generated in last 3 iterations
# (W[13,14,15]) are unused after each of these iterations.
# Since we use r8..r15 for W[8..15], this does not matter.
# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15]
# (the "movl %esi, W[n & 15]" store insn) is a dead store and can be removed.
} | grep -v '^$'

# Emit the epilogue: recover the ctx pointer pushed last by the prologue,
# add a..e into ctx->hash[0..4] (offsets 80..96), and pop the saved
# registers in the reverse of the prologue's push order.  The adds are
# interleaved with the pops.  Finally close the symbol and the "#if" guard
# opened at the top of the generated file.
echo "
	popq	%rdi		#
	popq	%r12		#
	addl	%eax, 80(%rdi)	# ctx->hash[0] += a
	popq	%r13		#
	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
	popq	%r14		#
	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
	popq	%r15		#
	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
	popq	%rbx		#
	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e
	popq	%rbp		#

	ret
	.size	sha1_process_block64, .-sha1_process_block64
#endif"