/* blob: 632dab7e6d2a66e9170c8e4664f850d67c74040e */
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA1 insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

// 128-bit aligned register/memory move.
//#define mova128 movdqa
#define mova128 movaps
// 128-bit unaligned register/memory move.
//#define movu128 movdqu
#define movu128 movups
// Shuffle of the four 32-bit lanes of one register.
// NB: shufps takes its two low result lanes from the destination and
// its two high result lanes from the source, so it is a drop-in
// replacement for pshufd ONLY when both register operands are
// the same register. Every use in this file satisfies that.
//#define shuf128_32 pshufd
#define shuf128_32 shufps
17
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
23#define DATA_PTR %eax
24
25#define SHA256CONSTANTS %ecx
26
27#define MSG %xmm0
28#define STATE0 %xmm1
29#define STATE1 %xmm2
30#define MSGTMP0 %xmm3
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34#define MSGTMP4 %xmm7
35
/*
 * sha256_process_block64_shaNI
 *
 * Runs the 64 SHA-256 rounds over one 64-byte block using the x86 SHA
 * extension insns (sha256rnds2, sha256msg1, sha256msg2) and adds the
 * result into the hash state.
 *
 * In:    %eax = base pointer. The 64 message bytes are read from
 *               0..63(%eax); the 32 bytes of hash state are read from
 *               and written back to 76..107(%eax).
 *               NOTE(review): presumably a regparm call passing a pointer
 *               to the hash context - confirm against the C prototype.
 * Out:   nothing in registers; the hash state is updated in place.
 * Clobb: %ecx, %xmm0-%xmm7, flags. %eax is never written; %ebp and %esp
 *        are restored before returning.
 * Stack: 32 bytes of scratch in a frame realigned to 16 bytes.
 * Needs: K256 and PSHUFFLE_BSWAP32_FLIP_MASK 16-byte aligned (they are
 *        used as memory operands of paddd/pshufb).
 */
	.balign	8	# allow decoders to fetch at least 3 first insns
sha256_process_block64_shaNI:
	/* Keep the incoming %esp in %ebp: %esp is about to be realigned */
	pushl		%ebp
	movl		%esp, %ebp
	subl		$32, %esp
	andl		$~0xF, %esp	# paddd needs aligned memory operand

	/* Load the hash state (unaligned loads: 76 is not a multiple of 16) */
	movu128		76+0*16(%eax), STATE0
	movu128		76+1*16(%eax), STATE1

	/* Rearrange the state into the lane order sha256rnds2 works on */
	shuf128_32	$0xB1, STATE0, STATE0		/* CDAB */
	shuf128_32	$0x1B, STATE1, STATE1		/* EFGH */
	mova128		STATE0, MSGTMP4
	palignr		$8, STATE1, STATE0		/* ABEF */
	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */

	/*
	 * Point at the middle of the table: the displacements used below,
	 * N*16-8*16 for N = 0..15, then span -128..+112 and each fits in
	 * a signed byte (presumably chosen for the shorter encoding).
	 */
	movl		$K256+8*16, SHA256CONSTANTS

	/* Save hash values for addition after rounds */
	mova128		STATE0, 0*16(%esp)
	mova128		STATE1, 1*16(%esp)

	/*
	 * Every group below performs four rounds:
	 *   paddd        adds four round constants to four message words,
	 *   sha256rnds2  does two rounds using the low two dwords of MSG,
	 *   shuf128_32   $0x0E brings the high two dwords of MSG down,
	 *   sha256rnds2  does the remaining two rounds.
	 * The first four groups take their message words from the input
	 * (byte-swapped per dword by pshufb); later groups take them from
	 * the MSGTMP0..3 window, which the sha256msg1 / palignr+paddd /
	 * sha256msg2 steps keep extending.
	 */

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
	mova128		MSG, MSGTMP0
	paddd		0*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
	mova128		MSG, MSGTMP1
	paddd		1*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
	mova128		MSG, MSGTMP2
	paddd		2*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
	mova128		MSG, MSGTMP3
	paddd		3*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		4*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		5*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		6*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		7*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		8*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		9*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		10*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		11*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		12*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 */
	/* (from here on no sha256msg1: no further schedule words are started) */
	mova128		MSGTMP1, MSG
	paddd		13*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		14*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
	paddd		15*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Add current hash values with previously saved */
	paddd		0*16(%esp), STATE0
	paddd		1*16(%esp), STATE1

	/* Write hash values back in the correct order */
	shuf128_32	$0x1B, STATE0, STATE0		/* FEBA */
	shuf128_32	$0xB1, STATE1, STATE1		/* DCHG */
	mova128		STATE0, MSGTMP4
	pblendw		$0xF0, STATE1, STATE0		/* DCBA */
	palignr		$8, MSGTMP4, STATE1		/* HGFE */

	movu128		STATE0, 76+0*16(%eax)
	movu128		STATE1, 76+1*16(%eax)

	/* Drop the realigned scratch area and restore the caller frame */
	movl		%ebp, %esp
	popl		%ebp
	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
256
/*
 * The 64 SHA-256 round constants, four per row, in round order.
 * Row N is added to the message words of rounds 4*N..4*N+3.
 * The table is used as a paddd memory operand, hence the 16-byte alignment.
 * It lives in a mergeable constant section with 256-byte entity size.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
276
/*
 * pshufb control mask. Read from the lowest byte up it is
 * 03 02 01 00, 07 06 05 04, 0b 0a 09 08, 0f 0e 0d 0c,
 * i.e. it reverses the bytes inside each 32-bit lane and leaves
 * the order of the lanes unchanged.
 * It is used as a pshufb memory operand, hence the 16-byte alignment.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif