blob: 7e50b64fb8d7674554a6521f3035031ecadb39de [file] [log] [blame]
#!/bin/sh

# Generator for hash_md5_sha_x86-64.S.
# It is run by hand, never as part of "make": the effect of a change to
# this script on the generated code is hard to judge from the script alone,
# so a commit should carry the diff of the regenerated file as well.

# Everything written to stdout from here on lands in the generated file.
exec >hash_md5_sha_x86-64.S

# Fixed prologue: save callee-saved registers and ctx, byte-swap the block
# words into W[0..7] (below %rsp) and W[8..15] (%r8d..%r15d), and load
# a..e from ctx->hash[].
# The delimiter is quoted, so "$" in the assembly text is taken literally.
# The empty line before the delimiter is part of the output.
cat <<'PROLOGUE_EOF'
### Generated by hash_md5_sha_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
 .section .text.sha1_process_block64,"ax",@progbits
 .globl sha1_process_block64
 .hidden sha1_process_block64
 .type sha1_process_block64, @function

 .balign 8 # allow decoders to fetch at least 5 first insns
sha1_process_block64:
 pushq %rbp # 1 byte insn
 pushq %rbx # 1 byte insn
 pushq %r15 # 2 byte insn
 pushq %r14 # 2 byte insn
 pushq %r13 # 2 byte insn
 pushq %r12 # 2 byte insn
 pushq %rdi # we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi: temps
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
 movl $3, %eax
1:
 movq (%rdi,%rax,8), %rsi
 bswapq %rsi
 rolq $32, %rsi
 movq %rsi, -32(%rsp,%rax,8)
 decl %eax
 jns 1b

 movl 80(%rdi), %eax # a = ctx->hash[0]
 movl 84(%rdi), %ebx # b = ctx->hash[1]
 movl 88(%rdi), %ecx # c = ctx->hash[2]
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]

 movq 4*8(%rdi), %r8
 movq 4*10(%rdi), %r10
 bswapq %r8
 bswapq %r10
 movq 4*12(%rdi), %r12
 movq 4*14(%rdi), %r14
 bswapq %r12
 bswapq %r14
 movl %r8d, %r9d
 shrq $32, %r8
 movl %r10d, %r11d
 shrq $32, %r10
 movl %r12d, %r13d
 shrq $32, %r12
 movl %r14d, %r15d
 shrq $32, %r14

PROLOGUE_EOF
# W32 n - print the assembler operand that holds message word W[n].
#   n = 0..7   -> stack slot   "-32+4*n(%rsp)"
#   n = 8..15  -> 32-bit reg   "%r8d" .. "%r15d"
# Anything else (missing, non-numeric, negative, > 15) is an error:
# nothing is printed on stdout, a diagnostic goes to stderr and the
# (sub)shell exits with status 1.
# The callers use this inside command substitutions, where "exit" ends
# only the substitution's subshell and the generator keeps running with an
# empty operand. The stderr message is what makes such a bad call visible
# (stdout is the generated file).
W32() {
  case "$1" in
  '' | *[!0-9]*)
    # Rejects empty, signed and non-numeric arguments before "test" sees
    # them, so the failure is reported in terms of W32 itself.
    echo "W32: bad W[] index '$1' (expected 0..15)" >&2
    exit 1
    ;;
  esac
  if test "$1" -gt 15; then
    echo "W32: bad W[] index '$1' (expected 0..15)" >&2
    exit 1
  fi
  if test "$1" -lt 8; then
    echo "-32+4*$1(%rsp)"
  else
    echo "%r${1}d"
  fi
}
73
# It's possible to interleave insns in rounds to mostly eliminate
# dependency chains, but this is likely to help only old Pentium-based
# CPUs (ones without OOO, which can only simultaneously execute a pair
# of _adjacent_ insns).
# Testing on an old-ish Silvermont CPU (which has an OOO window of only
# about 8 insns) shows a very small (~1%) speedup.
80
# RD1A a b c d e n - emit one round-1 step that uses W[n] unmodified.
#   a..e  register name suffixes (ax, bx, cx, dx, bp) currently holding
#         the five working variables; emitted as %e<suffix> / %r<suffix>
#   n     round number; n & 15 selects the W[] slot
# Reads the global RCONST. Every chunk of output is preceded and followed
# by an empty line; the caller filters those out with grep -v '^$'.
RD1A() {
  local a="$1" b="$2" c="$3" d="$4" e="$5"
  local n=$(($6))
  local slot=$((n & 15))
  local fetch addend

  # Slots 8..15 are registers and can be added directly; slots 0..7 are
  # memory and go through %esi (slot 0 is still there from the prologue).
  if test "$slot" -ge 8; then
    fetch=" # W[n], in %r${slot}"
    addend="%r${slot}"
  elif test "$slot" -eq 0; then
    fetch=" # W[0], already in %esi"
    addend="%rsi"
  else
    fetch=" movl $(W32 "$slot"), %esi # W[n]"
    addend="%rsi"
  fi

  printf '%s\n' \
    '' "# ${n}" '' \
    '' "${fetch}" '' \
    '' \
    " movl %e${c}, %edi # c" \
    " xorl %e${d}, %edi # ^d" \
    " andl %e${b}, %edi # &b" \
    " xorl %e${d}, %edi # (((c ^ d) & b) ^ d)" \
    '' \
    '' " leal ${RCONST}(%r${e},${addend}), %e${e} # e += RCONST + W[n]" '' \
    '' \
    " addl %edi, %e${e} # e += (((c ^ d) & b) ^ d)" \
    " movl %e${a}, %esi #" \
    " roll \$5, %esi # rotl32(a,5)" \
    " addl %esi, %e${e} # e += rotl32(a,5)" \
    " rorl \$2, %e${b} # b = rotl32(b,30)" \
    ''
}
# RD1B a b c d e n - emit one round-1 step that first recomputes W[n & 15]
# from W[(n+13) & 15], W[(n+8) & 15], W[(n+2) & 15] and itself.
#   a..e  register name suffixes (ax, bx, cx, dx, bp) for the five
#         working variables
#   n     round number
# Reads the global RCONST. Every chunk of output is preceded and followed
# by an empty line; the caller filters those out with grep -v '^$'.
RD1B() {
  local a="$1" b="$2" c="$3" d="$4" e="$5"
  local n=$(($6))
  local slot=$((n & 15))
  local w13 w8 w2 w0 addend

  # Resolve each W[] operand once.
  w13=$(W32 "$(((n + 13) & 15))")
  w8=$(W32 "$(((n + 8) & 15))")
  w2=$(W32 "$(((n + 2) & 15))")
  w0=$(W32 "$slot")

  printf '%s\n' '' "# ${n}" ''

  if test "$slot" -lt 8; then
    # Memory slot: combine in %esi, then store the new word back.
    addend="%rsi"
    printf '%s\n' \
      '' \
      " movl ${w13}, %esi # W[(n+13) & 15]" \
      " xorl ${w8}, %esi # ^W[(n+8) & 15]" \
      " xorl ${w2}, %esi # ^W[(n+2) & 15]" \
      " xorl ${w0}, %esi # ^W[n & 15]" \
      " roll %esi #" \
      " movl %esi, ${w0} # store to W[n & 15]" \
      ''
  else
    # Register slot: update it in place.
    addend="%r${slot}"
    printf '%s\n' \
      '' \
      " xorl ${w13}, ${w0} # W[n & 15] ^= W[(n+13) & 15]" \
      " xorl ${w8}, ${w0} # ^W[(n+8) & 15]" \
      " xorl ${w2}, ${w0} # ^W[(n+2) & 15]" \
      " roll ${w0} #" \
      ''
  fi

  printf '%s\n' \
    '' \
    " movl %e${c}, %edi # c" \
    " xorl %e${d}, %edi # ^d" \
    " andl %e${b}, %edi # &b" \
    " xorl %e${d}, %edi # (((c ^ d) & b) ^ d)" \
    '' \
    '' " leal ${RCONST}(%r${e},${addend}), %e${e} # e += RCONST + W[n & 15]" '' \
    '' \
    " addl %edi, %e${e} # e += (((c ^ d) & b) ^ d)" \
    " movl %e${a}, %esi #" \
    " roll \$5, %esi # rotl32(a,5)" \
    " addl %esi, %e${e} # e += rotl32(a,5)" \
    " rorl \$2, %e${b} # b = rotl32(b,30)" \
    ''
}
# Round 1, steps 0..19. The five register roles rotate by one position per
# step, so the same pattern of five calls repeats every five steps.
# Steps 0..15 use W[n] as loaded (RD1A); 16..19 recompute it (RD1B).
# Empty lines produced by the emitters are removed by grep.
{
RCONST=0x5A827999
for base in 0 5 10; do
  RD1A ax bx cx dx bp $((base + 0))
  RD1A bp ax bx cx dx $((base + 1))
  RD1A dx bp ax bx cx $((base + 2))
  RD1A cx dx bp ax bx $((base + 3))
  RD1A bx cx dx bp ax $((base + 4))
done
RD1A ax bx cx dx bp 15
RD1B bp ax bx cx dx 16
RD1B dx bp ax bx cx 17
RD1B cx dx bp ax bx 18
RD1B bx cx dx bp ax 19
} | grep -v '^$'
155
# RD2 a b c d e n - emit one step whose mixing function is c ^ d ^ b,
# after recomputing W[n & 15] from W[(n+13) & 15], W[(n+8) & 15],
# W[(n+2) & 15] and itself.
#   a..e  register name suffixes (ax, bx, cx, dx, bp) for the five
#         working variables
#   n     round number
# Reads the global RCONST. Every chunk of output is preceded and followed
# by an empty line; the caller filters those out with grep -v '^$'.
RD2() {
  local a="$1" b="$2" c="$3" d="$4" e="$5"
  local n=$(($6))
  local slot=$((n & 15))
  local w13 w8 w2 w0 addend

  # Resolve each W[] operand once.
  w13=$(W32 "$(((n + 13) & 15))")
  w8=$(W32 "$(((n + 8) & 15))")
  w2=$(W32 "$(((n + 2) & 15))")
  w0=$(W32 "$slot")

  printf '%s\n' '' "# ${n}" ''

  if test "$slot" -lt 8; then
    # Memory slot: combine in %esi, then store the new word back.
    addend="%rsi"
    printf '%s\n' \
      '' \
      " movl ${w13}, %esi # W[(n+13) & 15]" \
      " xorl ${w8}, %esi # ^W[(n+8) & 15]" \
      " xorl ${w2}, %esi # ^W[(n+2) & 15]" \
      " xorl ${w0}, %esi # ^W[n & 15]" \
      " roll %esi #" \
      " movl %esi, ${w0} # store to W[n & 15]" \
      ''
  else
    # Register slot: update it in place.
    addend="%r${slot}"
    printf '%s\n' \
      '' \
      " xorl ${w13}, ${w0} # W[n & 15] ^= W[(n+13) & 15]" \
      " xorl ${w8}, ${w0} # ^W[(n+8) & 15]" \
      " xorl ${w2}, ${w0} # ^W[(n+2) & 15]" \
      " roll ${w0} #" \
      ''
  fi

  printf '%s\n' \
    '' \
    " movl %e${c}, %edi # c" \
    " xorl %e${d}, %edi # ^d" \
    " xorl %e${b}, %edi # ^b" \
    '' \
    '' " leal ${RCONST}(%r${e},${addend}), %e${e} # e += RCONST + W[n & 15]" '' \
    '' \
    " addl %edi, %e${e} # e += (c ^ d ^ b)" \
    " movl %e${a}, %esi #" \
    " roll \$5, %esi # rotl32(a,5)" \
    " addl %esi, %e${e} # e += rotl32(a,5)" \
    " rorl \$2, %e${b} # b = rotl32(b,30)" \
    ''
}
# Round 2, steps 20..39. The register roles rotate by one position per
# step, so the same five calls repeat for each group of five steps.
# Empty lines produced by the emitter are removed by grep.
{
RCONST=0x6ED9EBA1
for base in 20 25 30 35; do
  RD2 ax bx cx dx bp $((base + 0))
  RD2 bp ax bx cx dx $((base + 1))
  RD2 dx bp ax bx cx $((base + 2))
  RD2 cx dx bp ax bx $((base + 3))
  RD2 bx cx dx bp ax $((base + 4))
done
} | grep -v '^$'
200
# RD3 a b c d e n - emit one step whose mixing function is
# ((b | c) & d) | (b & c), together with the recomputation of W[n & 15]
# from W[(n+13) & 15], W[(n+8) & 15], W[(n+2) & 15] and itself.
#   a..e  register name suffixes (ax, bx, cx, dx, bp) for the five
#         working variables
#   n     round number
# Reads the global RCONST. Every chunk of output is preceded and followed
# by an empty line; the caller filters those out with grep -v '^$'.
RD3() {
  local a="$1" b="$2" c="$3" d="$4" e="$5"
  local n=$(($6))
  local slot=$((n & 15))
  local w13 w8 w2 w0 addend

  # Resolve each W[] operand once.
  w13=$(W32 "$(((n + 13) & 15))")
  w8=$(W32 "$(((n + 8) & 15))")
  w2=$(W32 "$(((n + 2) & 15))")
  w0=$(W32 "$slot")

  # The mixing function is emitted first: it borrows %esi as scratch, and
  # the W[] update below reuses %esi for memory slots.
  printf '%s\n' \
    '' \
    "# ${n}" \
    " movl %e${b}, %edi # di: b" \
    " movl %e${b}, %esi # si: b" \
    " orl %e${c}, %edi # di: b | c" \
    " andl %e${c}, %esi # si: b & c" \
    " andl %e${d}, %edi # di: (b | c) & d" \
    " orl %esi, %edi # ((b | c) & d) | (b & c)" \
    ''

  if test "$slot" -lt 8; then
    # Memory slot: combine in %esi, then store the new word back.
    addend="%rsi"
    printf '%s\n' \
      '' \
      " movl ${w13}, %esi # W[(n+13) & 15]" \
      " xorl ${w8}, %esi # ^W[(n+8) & 15]" \
      " xorl ${w2}, %esi # ^W[(n+2) & 15]" \
      " xorl ${w0}, %esi # ^W[n & 15]" \
      " roll %esi #" \
      " movl %esi, ${w0} # store to W[n & 15]" \
      ''
  else
    # Register slot: update it in place.
    addend="%r${slot}"
    printf '%s\n' \
      '' \
      " xorl ${w13}, ${w0} # W[n & 15] ^= W[(n+13) & 15]" \
      " xorl ${w8}, ${w0} # ^W[(n+8) & 15]" \
      " xorl ${w2}, ${w0} # ^W[(n+2) & 15]" \
      " roll ${w0} #" \
      ''
  fi

  printf '%s\n' \
    '' " addl %edi, %e${e} # += ((b | c) & d) | (b & c)" '' \
    '' " leal ${RCONST}(%r${e},${addend}), %e${e} # e += RCONST + W[n & 15]" '' \
    '' \
    " movl %e${a}, %esi #" \
    " roll \$5, %esi # rotl32(a,5)" \
    " addl %esi, %e${e} # e += rotl32(a,5)" \
    " rorl \$2, %e${b} # b = rotl32(b,30)" \
    ''
}
# Round 3, steps 40..59. The register roles rotate by one position per
# step, so the same five calls repeat for each group of five steps.
# Empty lines produced by the emitter are removed by grep.
{
# The constant is 0x8F1BBCDC, but written that way it is
# "out of range for signed 32bit displacement"; the negative spelling
# below has the same low 32 bits.
RCONST=-0x70E44324
for base in 40 45 50 55; do
  RD3 ax bx cx dx bp $((base + 0))
  RD3 bp ax bx cx dx $((base + 1))
  RD3 dx bp ax bx cx $((base + 2))
  RD3 cx dx bp ax bx $((base + 3))
  RD3 bx cx dx bp ax $((base + 4))
done
} | grep -v '^$'
249
# Round 4, steps 60..79: same step logic as round 2 (RD2); only the step
# numbers and RCONST differ.
# Empty lines produced by the emitter are removed by grep.
{
# The constant is 0xCA62C1D6, but written that way it is
# "out of range for signed 32bit displacement"; the negative spelling
# below has the same low 32 bits.
RCONST=-0x359D3E2A
for base in 60 65 70 75; do
  RD2 ax bx cx dx bp $((base + 0))
  RD2 bp ax bx cx dx $((base + 1))
  RD2 dx bp ax bx cx $((base + 2))
  RD2 cx dx bp ax bx $((base + 3))
  RD2 bx cx dx bp ax $((base + 4))
done
# Note: the new W[n&15] values produced by the last 3 steps (W[13,14,15])
# are never read again.
# With W[8..15] living in r8..r15 this costs nothing.
# If r8..r15 were used for W[0..7] instead, the store of W[13,14,15]
# (the "movl %esi, W[n & 15]" insn) would be a dead store and could be
# dropped.
} | grep -v '^$'
264
# Fixed epilogue: restore ctx and the saved registers in reverse push
# order, adding a..e into ctx->hash[0..4] between the pops, then return
# and close the #if opened in the prologue.
# One argument per emitted line; '' is an empty output line.
printf '%s\n' \
  '' \
  ' popq %rdi #' \
  ' popq %r12 #' \
  ' addl %eax, 80(%rdi) # ctx->hash[0] += a' \
  ' popq %r13 #' \
  ' addl %ebx, 84(%rdi) # ctx->hash[1] += b' \
  ' popq %r14 #' \
  ' addl %ecx, 88(%rdi) # ctx->hash[2] += c' \
  ' popq %r15 #' \
  ' addl %edx, 92(%rdi) # ctx->hash[3] += d' \
  ' popq %rbx #' \
  ' addl %ebp, 96(%rdi) # ctx->hash[4] += e' \
  ' popq %rbp #' \
  '' \
  ' ret' \
  ' .size sha1_process_block64, .-sha1_process_block64' \
  '#endif'