#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
/* The code is adapted from Linux kernel's source */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 * This file is run through the C preprocessor before assembly: the
 * #if / #ifdef / #define lines are preprocessor directives.
 */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

/* 128-bit aligned move: integer form kept for reference, fp form in use */
//#define mova128 movdqa
#define mova128 movaps
/* 128-bit unaligned move */
//#define movu128 movdqu
#define movu128 movups
/* 32-bit-lane shuffle; the code only uses it with identical src and dst */
//#define shuf128_32 pshufd
#define shuf128_32 shufps

// pshufb and palignr are SSSE3 insns.
// We do not check SSSE3 in cpuid,
// all SHA-capable CPUs support it as well.

#ifdef __linux__
/* Empty .note.GNU-stack section: this object does not need an executable stack */
	.section	.note.GNU-stack, "", @progbits
#endif
/*
 * sha256_process_block64_shaNI
 *
 * Runs the 64 SHA-256 rounds over one 64-byte message block using the
 * x86 SHA extensions (sha256rnds2 / sha256msg1 / sha256msg2) and adds
 * the result into the running hash state.
 *
 * In:    %rdi = pointer to a structure with
 *                 bytes  0..63   the message block to process,
 *                 bytes 80..111  eight 32-bit hash words A..H.
 *               NOTE(review): presumably the C-side hash context;
 *               confirm the 80-byte offset against the caller's struct.
 * Out:   hash words at 80(%rdi) updated in place. No register result.
 * Clobb: %rax, %xmm0-%xmm9 (all caller-saved under System V AMD64).
 * Stack: not used (leaf function, no calls).
 *
 * The message and state are accessed with unaligned moves, so %rdi
 * needs no particular alignment. K256 and the flip mask are fetched
 * as aligned 16-byte operands and are 16-byte aligned where defined.
 */
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

#define DATA_PTR	%rdi	/* input pointer (message at 0, hash at 80) */

#define SHA256CONSTANTS	%rax	/* K256 + 8*16, see leaq below */

#define MSG		%xmm0	/* W[t]+K[t]; sha256rnds2 reads xmm0 implicitly */
#define STATE0		%xmm1	/* hash words F,E,B,A (dword 0..3) */
#define STATE1		%xmm2	/* hash words H,G,D,C (dword 0..3) */
#define MSGTMP0		%xmm3	/* MSGTMP0..3: rolling window of 16 schedule words */
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

#define XMMTMP		%xmm7	/* scratch: state load, flip mask, palignr temp */

#define SAVE0		%xmm8	/* copy of STATE0 on entry */
#define SAVE1		%xmm9	/* copy of STATE1 on entry */

/* Build a shufps/pshufd immediate from four 2-bit lane selectors */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

	.balign	8	# allow decoders to fetch at least 2 first insns
sha256_process_block64_shaNI:

	/* Load the hash state and regroup it into the sha256rnds2 layout */
	movu128		80+0*16(%rdi), XMMTMP	/* ABCD (little-endian dword order) */
	movu128		80+1*16(%rdi), STATE1	/* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
/* (the 2nd register operand is the destination in AT&T order) */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0	/* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1	/* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
	/*
	 * Point at the middle of K256: the 16 row displacements below
	 * then span -128..+112 and each fits a signed 8-bit displacement.
	 */
	leaq		K256+8*16(%rip), SHA256CONSTANTS

	/* Save hash values for addition after rounds */
	mova128		STATE0, SAVE0
	mova128		STATE1, SAVE1

	/*
	 * Each 4-round group below:
	 *   MSG = four schedule words + four round constants (paddd),
	 *   sha256rnds2 consumes dwords 0,1 of MSG (two rounds),
	 *   shuf128_32 $0x0E moves dwords 2,3 of MSG down to dwords 0,1,
	 *   sha256rnds2 with STATE0/STATE1 swapped does the other two.
	 * For the first 16 rounds the words come from the input block,
	 * byte-swapped per dword by pshufb with the flip mask.
	 */

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP0
	paddd		0*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
	paddd		1*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
	paddd		2*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/*
	 * From here on the next four schedule words are produced while
	 * the current ones are being consumed:
	 *   sha256msg1 (issued in an earlier group) starts the update,
	 *   palignr $4 + paddd adds the words straddling the two most
	 *   recent vectors,
	 *   sha256msg2 completes the four new words.
	 * XMMTMP is reused as the palignr temporary, so the flip mask
	 * is gone after the last pshufb.
	 */

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
	paddd		3*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		4*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		5*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		6*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		7*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		8*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		9*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		10*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		11*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		12*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 (no sha256msg1 in the last three groups) */
	mova128		MSGTMP1, MSG
	paddd		13*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		14*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 (schedule complete, rounds only) */
	mova128		MSGTMP3, MSG
	paddd		15*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Add current hash values with previously saved */
	paddd		SAVE0, STATE0
	paddd		SAVE1, STATE1

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0	/* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP	/* EFGH */
	movu128		STATE0, 80+0*16(%rdi)
	movu128		XMMTMP, 80+1*16(%rdi)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

/*
 * K256: the 64 SHA-256 round constants, four 32-bit words per row.
 * Row i is the 16-byte operand added to the schedule words for rounds
 * 4*i .. 4*i+3, so the table must stay 16-byte aligned (it is used as
 * a paddd memory operand).
 * Placed in its own mergeable constant section with a 256-byte entity
 * size, i.e. the whole table is one mergeable entity.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * pshufb control mask that reverses the byte order inside each of the
 * four 32-bit lanes (result bytes 0..3 take source bytes 3,2,1,0, and
 * so on), turning the big-endian message words into native dwords.
 * Loaded with an aligned 16-byte move, hence the 16-byte alignment.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif /* ENABLE_SHA256_HWACCEL && __GNUC__ && __x86_64__ */