/* blob: 3905bad9add56ab059ef738ae913da889e407bc7 [file] [log] [blame] */
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * run through the C preprocessor. Target: 32-bit x86.
 */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

// Mnemonic aliases: the active definition is the shorter fp-typed
// encoding, the commented-out one is the integer-typed equivalent.
//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps

// pshufb and palignr are SSSE3 insns.
// We do not check SSSE3 in cpuid,
// all SHA-capable CPUs support it as well.

/*
 * sha256_process_block64_shaNI
 *
 * Runs the SHA-256 compression function over one 64-byte block using
 * the x86 SHA extensions (sha256rnds2 / sha256msg1 / sha256msg2).
 *
 * In:    %eax = base pointer. The 64-byte message block is read from
 *               0..63(%eax); the eight 32-bit state words are read from
 *               76..107(%eax).
 *               NOTE(review): offset 76 presumably matches the hash[]
 *               field of the caller's context struct - confirm against
 *               the C declaration.
 * Out:   the updated state is written back to 76..107(%eax).
 * Clobb: %ecx, %xmm0-%xmm7. %eax is never written.
 * Stack: no locals, no pushes.
 *
 * K256 and PSHUFFLE_BSWAP32_FLIP_MASK are referenced by absolute address.
 */

#ifdef __linux__
/*
 * Declare that this object does not need an executable stack.
 * Without this note GNU ld assumes a hand-written asm object may need one
 * and marks the whole binary's stack executable (or warns about it).
 */
	.section	.note.GNU-stack, "", @progbits
#endif
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/* Register roles */
#define DATA_PTR	%eax	/* base of message block / context */

#define SHA256CONSTANTS	%ecx	/* K256 + 128, see below */

#define MSG		%xmm0	/* implicit operand of sha256rnds2 */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSGTMP0		%xmm3	/* MSGTMP0..3: sliding window of 16 schedule words */
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

#define XMMTMP		%xmm7	/* byte-swap mask for rounds 0-15, scratch elsewhere */

/* Builds the shufps/pshufd immediate from four 2-bit dword selectors */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

	.balign	8	/* allow decoders to fetch at least 2 first insns */
sha256_process_block64_shaNI:

	movu128		76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */
	movu128		76+1*16(%eax), STATE1 /* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
	/*
	 * Point into the middle of the table: the displacements used below
	 * then span -128..+112 and all fit in a signed 8-bit field.
	 */
	movl		$K256+8*16, SHA256CONSTANTS

	/*
	 * Every 4-round group below has the same shape:
	 *   MSG = four schedule words + four round constants (paddd)
	 *   sha256rnds2 does two rounds using the low two dwords of MSG
	 *   shuf128_32 $0x0E moves the high two dwords of MSG down
	 *   sha256rnds2 does the next two rounds
	 * From round 12 on, palignr/paddd/sha256msg2 and sha256msg1 are
	 * interleaved to build the schedule words for later groups.
	 */

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP0
	paddd		0*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
	paddd		1*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
	paddd		2*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
	paddd		3*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		4*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		5*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		6*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		7*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		8*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		9*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		10*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		11*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		12*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 */
	mova128		MSGTMP1, MSG
	paddd		13*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		14*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
	paddd		15*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	/* add current hash values to previous ones */
	movu128		76+1*16(%eax), STATE1
	paddd		XMMTMP, STATE1
	movu128		STATE1, 76+1*16(%eax)
	movu128		76+0*16(%eax), XMMTMP
	paddd		XMMTMP, STATE0
	movu128		STATE0, 76+0*16(%eax)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

/*
 * SHA-256 round constants K[0..63], stored as sixteen rows of four
 * 32-bit words (256 bytes total). The round code adds one 16-byte row per
 * 4-round group with a memory-operand paddd, which requires 16-byte
 * alignment - hence the .balign below.
 * The section is mergeable ("M") with an entity size of 256 bytes.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * pshufb control mask that reverses the byte order inside each of the
 * four 32-bit lanes, converting the big-endian message words to host
 * order. It is loaded with an aligned 128-bit move, so it must sit on a
 * 16-byte boundary.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif /* ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) */