blob: 2366b046afa7407efb8d5feafc59e55ba9d48c06 [file] [log] [blame]
#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * Instruction aliases.
 *
 * Each alias names a 128-bit operation and maps it to the shorter
 * floating-point-domain encoding; the integer-domain mnemonic it
 * stands in for is kept next to it, commented out, so the choice
 * can be flipped by swapping the two lines.
 */
// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

/* 128-bit register/memory move, memory operand must be 16-byte aligned */
//#define mova128 movdqa
#define mova128 movaps
/* 128-bit register/memory move, no alignment requirement */
//#define movu128 movdqu
#define movu128 movups
/* 128-bit bitwise XOR */
//#define xor128 pxor
#define xor128 xorps
/* Shuffle of the four 32-bit lanes, selected by an immediate */
//#define shuf128_32 pshufd
#define shuf128_32 shufps

/* Extract one 32-bit lane, selected by an immediate */
#define extr128_32 pextrd
//#define extr128_32 extractps	# not shorter
Denys Vlasenkoa96ccbe2022-01-07 01:32:13 +010022
// pshufb is an SSSE3 insn.
// pinsrd, pextrd, extractps are SSE4.1 insns.
// We do not check SSSE3/SSE4.1 in cpuid, because
// all SHA-capable CPUs support them as well.
27
/*
 * sha1_process_block64_shaNI
 *
 * Runs the 80 SHA-1 rounds over one 64-byte block with the SHA-NI
 * instructions and adds the result into the stored hash state.
 *
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 *
 * In:    %eax = base address of the hashing context. This routine reads
 *               the 64-byte message block at 0..63(%eax) and the five
 *               32-bit hash words at 76..95(%eax).
 *               NOTE(review): taking the pointer in %eax implies a
 *               regparm-style C prototype, and the offsets 0 and 76 are
 *               hard-coded here - confirm both against the C declaration
 *               of the context structure.
 * Out:   hash words at 76..95(%eax) updated in place.
 * Clobb: %xmm0-%xmm7. No general-purpose register is written and the
 *        stack is not touched.
 *
 * Requires the mova128/movu128/xor128/shuf128_32/extr128_32 aliases and
 * the PSHUFFLE_BYTE_FLIP_MASK constant defined elsewhere in this file.
 */

#ifdef __linux__
	/*
	 * Mark this object as not needing an executable stack. Without
	 * this note the linker assumes the opposite for the whole binary.
	 */
	.section	.note.GNU-stack, "", @progbits
#endif
	.section	.text.sha1_process_block64_shaNI, "ax", @progbits
	.globl	sha1_process_block64_shaNI
	.hidden	sha1_process_block64_shaNI
	.type	sha1_process_block64_shaNI, @function

/* Register roles for the whole routine */
#define ABCD		%xmm0	/* working state words A..D */
#define E0		%xmm1	/* Need two E's b/c they ping pong */
#define E1		%xmm2
#define MSG0		%xmm3	/* four message-schedule quads, rotated in place */
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
/* %xmm7 is scratch: byte-flip mask first, then the saved E, then the saved ABCD */

	.balign	8	# allow decoders to fetch at least 2 first insns
sha1_process_block64_shaNI:
	/* load initial hash values */
	movu128		76(%eax), ABCD
	xor128		E0, E0			/* lanes 0..2 of E0 must be zero */
	pinsrd		$3, 76+4*4(%eax), E0	# load to uppermost 32-bit word
	shuf128_32	$0x1B, ABCD, ABCD	# DCBA -> ABCD

	/* aligned load: relies on the 16-byte alignment of the constant */
	mova128		PSHUFFLE_BYTE_FLIP_MASK, %xmm7

	/* load the 64-byte block and byte-reverse each 16-byte quarter */
	movu128		0*16(%eax), MSG0
	pshufb		%xmm7, MSG0
	movu128		1*16(%eax), MSG1
	pshufb		%xmm7, MSG1
	movu128		2*16(%eax), MSG2
	pshufb		%xmm7, MSG2
	movu128		3*16(%eax), MSG3
	pshufb		%xmm7, MSG3

	/* Save hash values for addition after rounds */
	mova128		E0, %xmm7		/* mask is no longer needed */
	/*mova128	ABCD, %xmm8 - NOPE, 32bit has no xmm8 */

	/*
	 * Each group below performs four rounds. The pattern is:
	 *   sha1nexte  - fold the next E into the next message quad
	 *   mova128    - keep a copy of ABCD; it supplies E four rounds later
	 *   sha1rnds4  - four rounds; the immediate changes every 20 rounds
	 *   sha1msg1 / xor128 / sha1msg2 - extend the message schedule
	 * E0 and E1 alternate roles from one group to the next.
	 */

	/* Rounds 0-3 */
	paddd		MSG0, E0		/* first group: plain add, no rotate */
	mova128		ABCD, E1
	sha1rnds4	$0, E0, ABCD

	/* Rounds 4-7 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1rnds4	$0, E1, ABCD
	sha1msg1	MSG1, MSG0

	/* Rounds 8-11 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1rnds4	$0, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 12-15 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$0, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 16-19 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$0, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 20-23 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$1, E1, ABCD
	sha1msg1	MSG1, MSG0
	xor128		MSG1, MSG3

	/* Rounds 24-27 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$1, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 28-31 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$1, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 32-35 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$1, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 36-39 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$1, E1, ABCD
	sha1msg1	MSG1, MSG0
	xor128		MSG1, MSG3

	/* Rounds 40-43 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$2, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 44-47 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$2, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 48-51 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$2, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 52-55 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$2, E1, ABCD
	sha1msg1	MSG1, MSG0
	xor128		MSG1, MSG3

	/* Rounds 56-59 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$2, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 60-63 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$3, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 64-67 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$3, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 68-71 (schedule is winding down: no sha1msg1 from here on) */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$3, E1, ABCD
	xor128		MSG1, MSG3

	/* Rounds 72-75 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$3, E0, ABCD

	/* Rounds 76-79 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1rnds4	$3, E1, ABCD

	/* Add current hash values with previously saved */
	sha1nexte	%xmm7, E0		/* top lane of E0 = final E + saved E */
	/*paddd		%xmm8, ABCD - 32-bit mode has no xmm8 */
	movu128		76(%eax), %xmm7	# get original ABCD (not shuffled)...

	/* Write hash values back in the correct order */
	shuf128_32	$0x1B, ABCD, ABCD	/* back to the in-memory lane order */
	paddd		%xmm7, ABCD		# ...add it to final ABCD
	movu128		ABCD, 76(%eax)
	extr128_32	$3, E0, 76+4*4(%eax)

	ret
	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
Denys Vlasenkoa96ccbe2022-01-07 01:32:13 +0100225
/*
 * PSHUFFLE_BYTE_FLIP_MASK: pshufb control vector that reverses all 16
 * bytes of a register.
 *
 * .octa stores the value little-endian, so byte i in memory is 15 - i
 * (0x0f at the lowest address, 0x00 at the highest); pshufb therefore
 * moves source byte 15 - i into destination byte i.
 *
 * The section is mergeable ("M") with 16-byte entries, which lets the
 * linker fold identical constants, and the 16-byte alignment is what
 * allows the constant to be fetched with an aligned load.
 */
	.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x000102030405060708090a0b0c0d0e0f

#endif /* ENABLE_SHA1_HWACCEL && __GNUC__ && __i386__ */