/* blob: 082ceafe4e0db88f99ff559b965f381df283ce53 */
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
/* The code is adapted from Linux kernel's source */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * passed through the C preprocessor before assembly.
 * Target: x86-64 with the SHA extensions (sha256rnds2, sha256msg1,
 * sha256msg2) and SSSE3 (pshufb, palignr).
 */

// We use shorter insns, even though they are for the "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing so at all
// (CPUs which do have such a penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find a measurable difference).

/* Aligned 128-bit move (integer form would be movdqa) */
//#define mova128 movdqa
#define mova128 movaps
/* Unaligned 128-bit move (integer form would be movdqu) */
//#define movu128 movdqu
#define movu128 movups
/*
 * 32-bit-element shuffle. shufps is a drop-in replacement for pshufd
 * ONLY when source and destination are the same register, which is
 * how every use in this file is written (shuf128_32 $imm, MSG, MSG):
 * pshufd picks all four dwords from the source, whereas shufps picks
 * dwords 0,1 from the destination and dwords 2,3 from the source.
 */
//#define shuf128_32 pshufd
#define shuf128_32 shufps

// pshufb and palignr are SSSE3 insns.
// We do not check SSSE3 in cpuid,
// all SHA-capable CPUs support it as well.

/*
 * Symbol and section setup for sha256_process_block64_shaNI.
 * The function gets its own .text subsection; the symbol is global
 * but has hidden ELF visibility.
 */
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/*
 * Register roles used by the function body.
 */

/* Base pointer for the 64-byte input block (read at offsets 0..63) */
#define DATA_PTR	%rdi

/* Points at K256 + 8*16, i.e. the middle of the round-constant table */
#define SHA256CONSTANTS	%rax

/*
 * W[t] + K[t] for the current four rounds. This has to be %xmm0:
 * sha256rnds2 reads its third operand from XMM0 implicitly.
 */
#define MSG		%xmm0
/* Working state, held as dwords F,E,B,A (dword 0 first) */
#define STATE0		%xmm1
/* Working state, held as dwords H,G,D,C (dword 0 first) */
#define STATE1		%xmm2
/* Sliding window of four message-schedule vectors (4 dwords each) */
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

/*
 * Scratch: holds the byte-swap mask while the input block is loaded
 * (rounds 0-15), and palignr results / shuffle temporaries otherwise.
 */
#define XMMTMP		%xmm7

/* Copies of the state at entry, added back after round 63 */
#define SAVE0		%xmm8
#define SAVE1		%xmm9

/*
 * Build the 8-bit shuffle immediate from four 2-bit dword selectors:
 * a selects result dword 0, b dword 1, c dword 2, d dword 3.
 */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

/*
 * sha256_process_block64_shaNI
 *
 * Runs the 64 SHA-256 rounds over one 64-byte block using the SHA-NI
 * instructions and adds the result into the running hash state.
 *
 * In:   %rdi = pointer to a buffer laid out as
 *                offset  0..63   the 64-byte input block (big-endian
 *                                words, byte-swapped on load)
 *                offset 80..111  the hash state, eight 32-bit words
 *                                A,B,C,D,E,F,G,H in that order
 *              NOTE(review): presumably this is the hash context
 *              structure of the C caller - confirm the offsets against
 *              its definition.
 * Out:  hash state at 80(%rdi) updated in place. No return value is
 *       produced (%rax is left pointing into K256).
 * Clobbers: %rax, %xmm0-%xmm9. No stack is used.
 * Needs: K256 and PSHUFFLE_BSWAP32_FLIP_MASK, both 16-byte aligned
 *        (they are read with aligned loads / memory operands).
 *
 * Round structure: each group of four rounds forms W+K in MSG, runs
 * sha256rnds2 on its low two dwords, moves the high two dwords down
 * and runs sha256rnds2 again. STATE0 and STATE1 swap source and
 * destination roles between the two, so no extra register moves are
 * needed.
 * Message schedule: sha256msg1, then palignr+paddd, then sha256msg2
 * are spread over consecutive round groups to produce the next four
 * W words; the four MSGTMPn registers rotate through these roles.
 */
	.balign	8	/* allow decoders to fetch at least 2 first insns */
sha256_process_block64_shaNI:

	movu128		80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */
	movu128		80+1*16(%rdi), STATE1 /* EFGH */
/* shufps takes result dwords 0,1 from the destination (last operand)
 * and result dwords 2,3 from the source (middle operand) */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
	/* Point at the middle of K256: all 16 displacements used below
	 * then lie in -128..+112 */
	leaq		K256+8*16(%rip), SHA256CONSTANTS

	/* Save hash values for addition after rounds */
	mova128		STATE0, SAVE0
	mova128		STATE1, SAVE1

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG		/* big-endian words -> native dwords */
	mova128		MSG, MSGTMP0		/* keep W[0..3] for the schedule */
	paddd		0*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1	/* 2 rounds, uses MSG dwords 0,1 */
	shuf128_32	$0x0E, MSG, MSG		/* MSG dwords 2,3 -> dwords 0,1 */
	sha256rnds2	MSG, STATE1, STATE0	/* 2 more rounds */

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
	paddd		1*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0	/* start schedule for W[16..19] */

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
	paddd		2*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
	paddd		3*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP	/* dwords 1..3 of MSGTMP2, dword 0 of MSGTMP3 */
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0	/* MSGTMP0 = W[16..19] */
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		4*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		5*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		6*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		7*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		8*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		9*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		10*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		11*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		12*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 (from here on no further sha256msg1 is issued:
	 * only the schedule vectors already in flight are finished) */
	mova128		MSGTMP1, MSG
	paddd		13*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		14*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
	paddd		15*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Add current hash values with previously saved */
	paddd		SAVE0, STATE0
	paddd		SAVE1, STATE1

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes result dwords 0,1 from the destination (last operand)
 * and result dwords 2,3 from the source (middle operand) */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	movu128		STATE0, 80+0*16(%rdi)
	movu128		XMMTMP, 80+1*16(%rdi)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

/*
 * K256: the 64 SHA-256 round constants, four per row, one row per
 * group of four rounds (row i is added to the message words of rounds
 * 4*i .. 4*i+3).
 * Placed in its own mergeable read-only section with a 256-byte entry
 * size. The 16-byte alignment is required: the rows are used as
 * memory operands of paddd, which needs aligned addresses.
 * The code addresses this table relative to K256 + 8*16.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * pshufb control mask that reverses the byte order inside each of the
 * four 32-bit dwords (result bytes 0..3 come from source bytes
 * 3,2,1,0, and so on for each dword). Used to convert the big-endian
 * message words of the input block to native dword order.
 * Must be 16-byte aligned: it is loaded with an aligned move.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif