/* blob: 4b33449d40db77e8b039651681d456e4448ea4c5 */
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * Instruction aliases for 128-bit moves and dword shuffles.
 *
 * We use the shorter encodings, even though they are nominally for the
 * "wrong" data type (fp, not int).
 * For Intel, there is no penalty for doing it at all
 * (CPUs which do have such a penalty do not support SHA insns).
 * For AMD, the penalty is one extra cycle
 * (allegedly: I failed to find a measurable difference).
 *
 * The commented-out definitions are the integer-domain equivalents.
 */
//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps
17
/*-----------------------------------------------------------------------
 * sha256_process_block64_shaNI
 *
 * Runs the 64 SHA-256 rounds over one 64-byte message block using the
 * x86 SHA extension (sha256rnds2 / sha256msg1 / sha256msg2) and adds the
 * result into the running hash.
 *
 * Syntax: GNU as, AT&T operand order (src, dst), preprocessed by cpp.
 * In:     %eax = pointer to the context. The code reads the message block
 *                from bytes 0..63 and the eight 32-bit hash words from
 *                CTX_HASH_OFS..CTX_HASH_OFS+31.
 *                NOTE(review): pointer-in-%eax implies a register calling
 *                convention on the C side - confirm against the prototype.
 * Out:    the eight hash words at CTX_HASH_OFS(%eax) are updated in place.
 * Clobb:  %ecx, %xmm0-%xmm7.
 * Stack:  not used (apart from the return address consumed by ret).
 * Align:  the context is accessed with unaligned moves only; K256 and the
 *         byte-swap mask are used as aligned 16-byte memory operands.
 * CPU:    needs the SHA extension and SSSE3 (pshufb, palignr); this code
 *         does not test for them itself.
 *-----------------------------------------------------------------------*/
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/* Pointer argument: message block at offset 0, hash at CTX_HASH_OFS */
#define DATA_PTR	%eax

/* Points into the middle of K256 (see the movl that loads it) */
#define SHA256CONSTANTS	%ecx

/*
 * Byte offset, relative to DATA_PTR, of the eight 32-bit hash words.
 * NOTE(review): must match the layout of the C-side context structure -
 * confirm whenever that structure changes.
 */
#define CTX_HASH_OFS	76

/* MSG must be %xmm0: sha256rnds2 takes W+K from %xmm0 implicitly */
#define MSG		%xmm0
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

/* Byte-swap mask during rounds 0-15, scratch register everywhere else */
#define XMMTMP		%xmm7

/* Build a shufps/pshufd immediate from four 2-bit dword selectors */
#define SHUF(a,b,c,d) $((a)+((b)<<2)+((c)<<4)+((d)<<6))

	.balign	8	/* allow decoders to fetch at least 2 first insns */
sha256_process_block64_shaNI:

	movu128		CTX_HASH_OFS+0*16(DATA_PTR), XMMTMP /* ABCD (little-endian dword order) */
	movu128		CTX_HASH_OFS+1*16(DATA_PTR), STATE1 /* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
	/*
	 * Bias the table pointer by 8*16 = 128 bytes so that all sixteen
	 * 16-byte rows are reachable with displacements -128..+112,
	 * i.e. every paddd below gets a one-byte displacement.
	 */
	movl		$K256+8*16, SHA256CONSTANTS

	/*
	 * Pattern of every 4-round group below:
	 *   MSG = W[t..t+3] + K[t..t+3]
	 *   sha256rnds2 does two rounds from the low two dwords of MSG,
	 *   shuf128_32 $0x0E moves the high two dwords down,
	 *   the second sha256rnds2 does the other two rounds.
	 * Instructions indented one level belong to the message schedule,
	 * those indented two levels to the round computation.
	 */

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG		/* big-endian words -> native dwords */
	mova128		MSG, MSGTMP0
		paddd		0*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
		paddd		1*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
		paddd		2*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
		paddd		3*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	/* XMMTMP = dwords 1..3 of MSGTMP2 followed by dword 0 of MSGTMP3 */
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
		paddd		4*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
		paddd		5*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
		paddd		6*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
		paddd		7*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
		paddd		8*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
		paddd		9*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
		paddd		10*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
		paddd		11*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
		paddd		12*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 (no sha256msg1 from here on: its result would be unused) */
	mova128		MSGTMP1, MSG
		paddd		13*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
		paddd		14*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
		paddd		15*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	/* add current hash values to previous ones */
	movu128		CTX_HASH_OFS+1*16(DATA_PTR), STATE1
	paddd		XMMTMP, STATE1
	movu128		STATE1, CTX_HASH_OFS+1*16(DATA_PTR)
	movu128		CTX_HASH_OFS+0*16(DATA_PTR), XMMTMP
	paddd		XMMTMP, STATE0
	movu128		STATE0, CTX_HASH_OFS+0*16(DATA_PTR)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
251
/*
 * K256: the 64 SHA-256 round constants, four per row (one row per group
 * of four rounds). The round code adds whole rows with paddd using a
 * memory operand, hence the 16-byte alignment.
 * Placed in a mergeable ("M") read-only section with entity size 256.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
271
/*
 * pshufb control mask that reverses the byte order inside each of the
 * four dwords (result byte 0 <- source byte 3, byte 1 <- byte 2, ...).
 * Loaded with an aligned move, hence the 16-byte alignment.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif /* ENABLE_SHA256_HWACCEL && __GNUC__ && __i386__ */