/* Source blob: 5ed80c2ef7c564dbd71da0073fa8827258fde71b */
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
/* The code is adapted from Linux kernel's source */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 * The file is run through the C preprocessor: the #define aliases below
 * are expanded before the assembler sees the text.
 */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty at all for doing this
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

/* mova128: 128-bit register/aligned-memory move */
//#define mova128 movdqa
#define mova128 movaps
/* movu128: 128-bit move with no alignment requirement on the memory operand */
//#define movu128 movdqu
#define movu128 movups
/*
 * shuf128_32: dword shuffle. Only used with identical source and destination
 * registers, where shufps and pshufd give the same result.
 */
//#define shuf128_32 pshufd
#define shuf128_32 shufps

	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/* Base pointer argument; the 64-byte message block is read from offset 0 */
#define DATA_PTR	%rdi

/* Points into the middle of the K256 table (set up with leaq in the function) */
#define SHA256CONSTANTS	%rax

/* MSG has to be %xmm0: sha256rnds2 takes its W+K input implicitly from XMM0 */
#define MSG		%xmm0
/* Working state; holds {A,B,E,F} / {C,D,G,H} at 4-round group boundaries */
#define STATE0		%xmm1
#define STATE1		%xmm2
/* Rolling window of four 4-dword message schedule vectors */
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

/* Scratch: initial ABCD load, then byte-swap mask, then palignr temporary */
#define XMMTMP		%xmm7

/* Copies of the incoming state, added back after the 64 rounds */
#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

/*
 * SHUF(a,b,c,d): immediate for shufps; a,b,c,d are the 2-bit source dword
 * selectors for result dwords 0,1,2,3 respectively.
 */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

/*
 * sha256_process_block64_shaNI: one SHA-256 compression step (64 rounds)
 * using the x86 SHA extensions (sha256rnds2 / sha256msg1 / sha256msg2).
 *
 * In:   %rdi = base pointer. As used below:
 *         bytes  0..63   one 64-byte message block (read only)
 *         bytes 80..111  eight 32-bit state words A..H (read, then updated)
 *       Both areas are accessed with unaligned moves, so no alignment of
 *       %rdi is required.
 *       NOTE(review): offset 80 presumably matches the hash[] field of the
 *       C-side context structure - confirm against its declaration.
 * Out:  state words at 80(%rdi) updated in place. No return value is set
 *       up; %rax is left pointing at K256+8*16.
 * Clobbers: %rax, %xmm0-%xmm7, %xmm9, %xmm10. The stack is not touched.
 * Requires: SHA-NI plus SSSE3 (pshufb, palignr). No CPU feature check is
 *       done here. NOTE(review): presumably the caller dispatches on
 *       CPUID - confirm.
 *
 * Register protocol:
 *   sha256rnds2 SRC, DST performs two rounds: DST holds CDGH on input and
 *   receives the new ABEF; SRC (the old ABEF) then serves as the new CDGH.
 *   Hence each 4-round group issues the insn twice with the operands
 *   swapped, and STATE0/STATE1 are back in their ABEF/CDGH roles afterwards.
 *   The two W+K dwords consumed come from the low half of MSG (%xmm0);
 *   "shuf128_32 $0x0E, MSG, MSG" moves the high half down in between.
 *
 * Message schedule (from rounds 12-15 on), for the next vector NEXT:
 *   sha256msg1 (issued earlier) -> palignr/paddd adds the W[t-7] terms ->
 *   sha256msg2 finishes NEXT. MSGTMP0..3 are reused cyclically.
 */
	.balign 8	# allow decoders to fetch at least 2 first insns
sha256_process_block64_shaNI:

	movu128		80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */
	movu128		80+1*16(%rdi), STATE1 /* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
	/*
	 * Bias the table pointer by 8 rows: the displacements used below then
	 * span -128..+112, i.e. every one of them fits a signed byte.
	 */
	leaq		K256+8*16(%rip), SHA256CONSTANTS

	/* Save hash values for addition after rounds */
	mova128		STATE0, ABEF_SAVE
	mova128		STATE1, CDGH_SAVE

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG		/* byte-swap each dword */
	mova128		MSG, MSGTMP0
	paddd		0*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
	paddd		1*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
	paddd		2*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
	paddd		3*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP	/* = MSGTMP2 dwords 1..3, MSGTMP3 dword 0 */
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		4*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		5*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		6*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		7*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		8*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		9*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		10*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		11*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		12*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 (no sha256msg1 from here on: no further vector is started) */
	mova128		MSGTMP1, MSG
	paddd		13*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		14*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
	paddd		15*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	movu128		STATE0, 80+0*16(%rdi)
	movu128		XMMTMP, 80+1*16(%rdi)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

/*
 * K256: the 64 SHA-256 round constants, 16 rows of four 32-bit words.
 * Row i is added to the message words of rounds 4*i .. 4*i+3
 * (the code addresses it as i*16-8*16 off a pointer to K256+8*16).
 * Must stay 16-byte aligned: it is used as a memory operand of paddd.
 * Placed in its own mergeable constant section with entity size 256.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * pshufb control mask that reverses the byte order inside each of the four
 * dwords (selector bytes, low to high: 3,2,1,0, 7,6,5,4, 11,10,9,8,
 * 15,14,13,12). Applied to every 16-byte chunk of the message block.
 * Loaded with an aligned move, hence the 16-byte alignment.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif