#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * run through the C preprocessor. Target: 32-bit x86 with the SHA and
 * SSSE3 extensions.
 */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps

// pshufb and palignr are SSSE3 insns.
// We do not check SSSE3 in cpuid,
// all SHA-capable CPUs support it as well.

#ifdef __linux__
	.section	.note.GNU-stack, "", @progbits
#endif
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/*
 * sha256_process_block64_shaNI
 *
 * Runs the 64 SHA-256 rounds over one 64-byte message block and adds
 * the result into the running hash state, in place.
 *
 * In:    %eax = pointer P (DATA_PTR)
 *          P+0        .. P+63              message block, read with unaligned loads
 *          P+STATE_OFS .. P+STATE_OFS+31   eight 32-bit state words:
 *                                          A,B,C,D then E,F,G,H
 * Out:   the eight state words at P+STATE_OFS are updated in memory.
 * Clobb: %ecx, %xmm0-%xmm7. No stack is used; %eax is preserved.
 *
 * NOTE(review): taking the argument in %eax implies a register-argument
 * convention (e.g. regparm) on the C side - confirm against the prototype.
 */

#define DATA_PTR	%eax

/*
 * Byte offset of the hash state from DATA_PTR.
 * NOTE(review): must match the layout of the C-side context structure,
 * which is not visible in this file.
 */
#define STATE_OFS	76

#define SHA256CONSTANTS	%ecx

#define MSG		%xmm0
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

#define XMMTMP		%xmm7

/* Build a shufps/pshufd immediate from four 2-bit dword selectors (a = result dword 0) */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

	.balign	8	/* allow decoders to fetch at least 2 first insns */
sha256_process_block64_shaNI:

	movu128		STATE_OFS+0*16(DATA_PTR), XMMTMP /* ABCD (little-endian dword order) */
	movu128		STATE_OFS+1*16(DATA_PTR), STATE1 /* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
	/*
	 * Point at the middle of the table: the displacements used below,
	 * n*16-8*16 for n = 0..15, then all lie within -128..+112.
	 */
	movl		$K256+8*16, SHA256CONSTANTS

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG		/* byte-swap each message dword */
	mova128		MSG, MSGTMP0
	paddd		0*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG		/* move dwords 2,3 down to 0,1 for the next two rounds */
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
	paddd		1*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
	paddd		2*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
	paddd		3*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
	paddd		4*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
	paddd		5*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
	paddd		6*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
	paddd		7*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
	paddd		8*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
	paddd		9*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
	paddd		10*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
	paddd		11*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
	paddd		12*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 */
	mova128		MSGTMP1, MSG
	paddd		13*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
	paddd		14*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
	paddd		15*16-8*16(SHA256CONSTANTS), MSG
	sha256rnds2	MSG, STATE0, STATE1
	shuf128_32	$0x0E, MSG, MSG
	sha256rnds2	MSG, STATE1, STATE0

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	/* add current hash values to previous ones */
	movu128		STATE_OFS+1*16(DATA_PTR), STATE1
	paddd		XMMTMP, STATE1
	movu128		STATE1, STATE_OFS+1*16(DATA_PTR)
	movu128		STATE_OFS+0*16(DATA_PTR), XMMTMP
	paddd		XMMTMP, STATE0
	movu128		STATE0, STATE_OFS+0*16(DATA_PTR)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

/*
 * SHA-256 round constants K[0..63], four per row (one row per group of
 * four rounds). Used as a memory operand of paddd, which requires
 * 16-byte alignment - hence the .balign.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * pshufb control mask that reverses the byte order inside each of the
 * four dwords. Loaded with an aligned move, so it must stay 16-byte
 * aligned.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif