#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
/* The code is adapted from Linux kernel's source */

// SHA-1 compression of a single 64-byte block using the x86 SHA
// extensions (sha1rnds4, sha1nexte, sha1msg1, sha1msg2).
// Syntax: GNU as, AT&T operand order (source first, destination last),
// preprocessed by cpp. The directives used (.type, .hidden, @progbits)
// are ELF-specific.

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define xor128 pxor
#define xor128 xorps
//#define shuf128_32 pshufd
#define shuf128_32 shufps

#define extr128_32 pextrd
//#define extr128_32 extractps	# not shorter

// pshufb is a SSSE3 insn.
// pinsrd, pextrd, extractps are SSE4.1 insns.
// We do not check SSSE3/SSE4.1 in cpuid,
// all SHA-capable CPUs support them as well.

// The compiler adds a .note.GNU-stack marker to every object it
// generates; hand-written assembly has to do it itself. Without the
// marker GNU ld treats the object as requiring an executable stack.
// The section is empty and has no flags: it only carries the marker.
	.section	.note.GNU-stack, "", @progbits

	.section	.text.sha1_process_block64_shaNI, "ax", @progbits
	.globl	sha1_process_block64_shaNI
	.hidden	sha1_process_block64_shaNI
	.type	sha1_process_block64_shaNI, @function

// Register roles for the whole function.
#define ABCD		%xmm0	/* working state words A,B,C,D (A in the top dword) */
#define E0		%xmm1	/* Need two E's b/c they ping pong */
#define E1		%xmm2
#define MSG0		%xmm3	/* rolling window of four 4-word message groups */
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
// %xmm7 is used without a name: first as the byte-flip mask, then as
// the saved copy of E. %xmm8 holds the saved copy of ABCD.

//-----------------------------------------------------------------------
// sha1_process_block64_shaNI
//
// In:    %rdi = pointer to the hashing context. As accessed below:
//          bytes  0..63 : the 64-byte message block to process
//          bytes 80..99 : five 32-bit hash words, updated in place
//        NOTE(review): presumably the C prototype is
//        void sha1_process_block64_shaNI(sha1_ctx_t *ctx) - confirm
//        against the C caller; it is not visible in this file.
// Out:   no register result; the hash words at 80(%rdi) are rewritten.
// Uses:  %xmm0-%xmm8 are overwritten. No general-purpose register is
//        written and the stack is not touched.
//
// Structure of the round code:
//   Each group of four rounds is one sha1rnds4. Its immediate ($0-$3)
//   selects the SHA-1 round function and constant and changes every
//   20 rounds.
//   E0/E1 alternate: "mova128 ABCD, Ex" keeps a copy of the state
//   before sha1rnds4 overwrites ABCD; four rounds later "sha1nexte
//   MSGn, Ex" turns that copy into the next E (rotated top word) and
//   adds the message group to it.
//   Message schedule, while running rounds with group MSGi:
//     sha1msg2 MSGi, <next>   finishes the group used by the next 4 rounds
//     sha1msg1 MSGi, <prev>   starts the group needed three steps later
//     xor128   MSGi, <next+1> folds MSGi into the group needed two steps later
//   The first and last few steps omit the parts that are not needed.
//-----------------------------------------------------------------------
	.balign	8	# allow decoders to fetch at least 2 first insns
sha1_process_block64_shaNI:
	/* load initial hash values */
	movu128		80(%rdi), ABCD
	xor128		E0, E0
	pinsrd		$3, 80+4*4(%rdi), E0	# load to uppermost 32-bit word
	shuf128_32	$0x1B, ABCD, ABCD	# DCBA -> ABCD

	/* aligned load is ok: the mask is .balign 16 in its own section */
	mova128		PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7

	/* load the 64-byte block, reversing the byte order of each 16 bytes */
	movu128		0*16(%rdi), MSG0
	pshufb		%xmm7, MSG0
	movu128		1*16(%rdi), MSG1
	pshufb		%xmm7, MSG1
	movu128		2*16(%rdi), MSG2
	pshufb		%xmm7, MSG2
	movu128		3*16(%rdi), MSG3
	pshufb		%xmm7, MSG3

	/* Save hash values for addition after rounds */
	/* (the mask in %xmm7 is no longer needed, the register is reused) */
	mova128		E0, %xmm7
	mova128		ABCD, %xmm8

	/* Rounds 0-3 */
	/* the initial E needs no rotation: plain add instead of sha1nexte */
	paddd		MSG0, E0
	mova128		ABCD, E1
	sha1rnds4	$0, E0, ABCD

	/* Rounds 4-7 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1rnds4	$0, E1, ABCD
	sha1msg1	MSG1, MSG0

	/* Rounds 8-11 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1rnds4	$0, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 12-15 */
	/* from here on the full msg2/rnds4/msg1/xor pattern is used */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$0, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 16-19 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$0, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 20-23 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$1, E1, ABCD
	sha1msg1	MSG1, MSG0
	xor128		MSG1, MSG3

	/* Rounds 24-27 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$1, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 28-31 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$1, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 32-35 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$1, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 36-39 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$1, E1, ABCD
	sha1msg1	MSG1, MSG0
	xor128		MSG1, MSG3

	/* Rounds 40-43 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$2, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 44-47 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$2, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 48-51 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$2, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 52-55 */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$2, E1, ABCD
	sha1msg1	MSG1, MSG0
	xor128		MSG1, MSG3

	/* Rounds 56-59 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$2, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 60-63 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1msg2	MSG3, MSG0
	sha1rnds4	$3, E1, ABCD
	sha1msg1	MSG3, MSG2
	xor128		MSG3, MSG1

	/* Rounds 64-67 */
	sha1nexte	MSG0, E0
	mova128		ABCD, E1
	sha1msg2	MSG0, MSG1
	sha1rnds4	$3, E0, ABCD
	sha1msg1	MSG0, MSG3
	xor128		MSG0, MSG2

	/* Rounds 68-71 */
	/* no sha1msg1: no message group beyond rounds 76-79 is needed */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$3, E1, ABCD
	xor128		MSG1, MSG3

	/* Rounds 72-75 */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$3, E0, ABCD

	/* Rounds 76-79 */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1rnds4	$3, E1, ABCD

	/* Add current hash values with previously saved */
	/* sha1nexte: top dword of E0 = rotated final E + saved E (in %xmm7) */
	sha1nexte	%xmm7, E0
	paddd		%xmm8, ABCD

	/* Write hash values back in the correct order */
	shuf128_32	$0x1B, ABCD, ABCD
	movu128		ABCD, 80(%rdi)
	extr128_32	$3, E0, 80+4*4(%rdi)

	ret
	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI

// pshufb control mask which reverses all 16 bytes of a register.
// .octa stores the least significant byte first, so the mask bytes in
// memory are 0x0f,0x0e,...,0x00: result byte 0 is taken from source
// byte 15, result byte 1 from source byte 14, and so on.
// Mergeable 16-byte constant section; 16-byte alignment is required by
// the mova128 (movaps) load in the function above.
	.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x000102030405060708090a0b0c0d0e0f

#endif