/* Web source-viewer residue, not part of the program: blob 6b12d1462d251160e04be8b424144fa98415f2c1 [file] [log] [blame] */
#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * Mnemonic aliases for the 128-bit move/logic/shuffle instructions.
 *
 * The floating-point flavoured mnemonics are selected because their
 * encodings are shorter, even though the data handled here is integer
 * (i.e. they are for the "wrong" data type).
 * For Intel, there is no penalty for doing it at all
 * (CPUs which do have such penalty do not support SHA1 insns).
 * For AMD, the penalty is one extra cycle
 * (allegedly: the original author failed to find a measurable difference).
 *
 *   alias        selected   integer-domain alternative
 *   mova128      movaps     movdqa
 *   movu128      movups     movdqu
 *   xor128       xorps      pxor
 *   shuf128_32   shufps     pshufd
 *   extr128_32   pextrd     extractps (not shorter, hence not used)
 */
#define mova128		movaps
#define movu128		movups
#define xor128		xorps
#define shuf128_32	shufps
#define extr128_32	pextrd
22
#ifdef __linux__
	/*
	 * Empty .note.GNU-stack section: tells the linker that this object
	 * does not require an executable stack.  Assembly objects lacking
	 * this marker can cause the whole linked binary to be flagged as
	 * needing an executable stack.
	 */
	.section	.note.GNU-stack, "", @progbits
#endif
	.section	.text.sha1_process_block64_shaNI,"ax",@progbits
	.globl	sha1_process_block64_shaNI
	.hidden	sha1_process_block64_shaNI
	.type	sha1_process_block64_shaNI, @function

/*
 * Register roles used by sha1_process_block64_shaNI:
 *   ABCD       hash state words packed into one register
 *   E0, E1     the fifth state word; two registers that alternate
 *   MSG0..3    four 16-byte groups of message schedule words
 *   SHUF_MASK  pshufb control loaded from PSHUFFLE_BYTE_FLIP_MASK
 */
#define ABCD		%xmm0
#define E0		%xmm1	/* Need two E's b/c they ping pong */
#define E1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define SHUF_MASK	%xmm7
36
/*
 * sha1_process_block64_shaNI
 *
 * Runs the 80 SHA-1 rounds over one 64-byte block using the SHA
 * extension instructions and folds the result into the hash state.
 *
 * In:    %eax = base pointer.  Bytes 0..63 from it are read as the
 *        message block; five 32-bit hash words are read from offsets
 *        76..95.
 *        NOTE(review): the argument arriving in %eax implies a register
 *        calling convention on the C side - confirm against the prototype.
 * Out:   the five hash words at 76..95(%eax) are rewritten in place.
 * Clobb: %xmm0-%xmm7, EFLAGS.  %ebp and %esp are restored before return.
 * Stack: 32 bytes of scratch, realigned to 16 bytes, hold the incoming
 *        state for the final addition.
 */

/*
 * SHA1_4ROUNDS: one group of four rounds in the steady-state part of
 * the schedule (every message register is being updated).
 *   fn      immediate passed to sha1rnds4
 *   e_cur   E register consumed by this group
 *   e_nxt   E register receiving a copy of ABCD for the following group
 *   w_cur   message words consumed by this group
 *   w_p1    message register updated by sha1msg2
 *   w_p2    message register that gets w_cur XORed into it
 *   w_p3    message register updated by sha1msg1
 */
	.macro SHA1_4ROUNDS fn, e_cur, e_nxt, w_cur, w_p1, w_p2, w_p3
	sha1nexte	\w_cur, \e_cur
	mova128		ABCD, \e_nxt
	sha1msg2	\w_cur, \w_p1
	sha1rnds4	$\fn, \e_cur, ABCD
	sha1msg1	\w_cur, \w_p3
	xor128		\w_cur, \w_p2
	.endm

	.balign	8	# allow decoders to fetch at least 3 first insns
sha1_process_block64_shaNI:
	/* %ebp remembers the caller %esp so the realigned scratch can be dropped at exit */
	pushl		%ebp
	movl		%esp, %ebp
	subl		$32, %esp
	andl		$~0xF, %esp		# paddd needs aligned memory operand

	/* Fetch the incoming hash state */
	xor128		E0, E0
	movu128		76(%eax), ABCD
	pinsrd		$3, 76+4*4(%eax), E0	# load to uppermost 32-bit word
	shuf128_32	$0x1B, ABCD, ABCD	# DCBA -> ABCD

	/*
	 * NOTE(review): the mask is referenced by absolute address; check
	 * that this is acceptable for position-independent builds.
	 */
	mova128		PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK

	/* Stash the incoming state; it is added back after round 79 */
	movu128		E0, 16(%esp)
	movu128		ABCD, (%esp)

	/* Rounds 0-3: first 16 message bytes, E added directly */
	movu128		0*16(%eax), MSG0
	pshufb		SHUF_MASK, MSG0
	paddd		MSG0, E0
	mova128		ABCD, E1
	sha1rnds4	$0, E0, ABCD

	/* Rounds 4-7: schedule update for MSG0 begins */
	movu128		1*16(%eax), MSG1
	pshufb		SHUF_MASK, MSG1
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1rnds4	$0, E1, ABCD
	sha1msg1	MSG1, MSG0

	/* Rounds 8-11 */
	movu128		2*16(%eax), MSG2
	pshufb		SHUF_MASK, MSG2
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1rnds4	$0, E0, ABCD
	sha1msg1	MSG2, MSG1
	xor128		MSG2, MSG0

	/* Rounds 12-15: last load from the block; full pattern from here on */
	movu128		3*16(%eax), MSG3
	pshufb		SHUF_MASK, MSG3
	SHA1_4ROUNDS	0, E1, E0, MSG3, MSG0, MSG1, MSG2

	/* Rounds 16-19 */
	SHA1_4ROUNDS	0, E0, E1, MSG0, MSG1, MSG2, MSG3

	/* Rounds 20-39: sha1rnds4 immediate 1 */
	SHA1_4ROUNDS	1, E1, E0, MSG1, MSG2, MSG3, MSG0	/* 20-23 */
	SHA1_4ROUNDS	1, E0, E1, MSG2, MSG3, MSG0, MSG1	/* 24-27 */
	SHA1_4ROUNDS	1, E1, E0, MSG3, MSG0, MSG1, MSG2	/* 28-31 */
	SHA1_4ROUNDS	1, E0, E1, MSG0, MSG1, MSG2, MSG3	/* 32-35 */
	SHA1_4ROUNDS	1, E1, E0, MSG1, MSG2, MSG3, MSG0	/* 36-39 */

	/* Rounds 40-59: sha1rnds4 immediate 2 */
	SHA1_4ROUNDS	2, E0, E1, MSG2, MSG3, MSG0, MSG1	/* 40-43 */
	SHA1_4ROUNDS	2, E1, E0, MSG3, MSG0, MSG1, MSG2	/* 44-47 */
	SHA1_4ROUNDS	2, E0, E1, MSG0, MSG1, MSG2, MSG3	/* 48-51 */
	SHA1_4ROUNDS	2, E1, E0, MSG1, MSG2, MSG3, MSG0	/* 52-55 */
	SHA1_4ROUNDS	2, E0, E1, MSG2, MSG3, MSG0, MSG1	/* 56-59 */

	/* Rounds 60-67: sha1rnds4 immediate 3, schedule still fully active */
	SHA1_4ROUNDS	3, E1, E0, MSG3, MSG0, MSG1, MSG2	/* 60-63 */
	SHA1_4ROUNDS	3, E0, E1, MSG0, MSG1, MSG2, MSG3	/* 64-67 */

	/* Rounds 68-71: no further sha1msg1 is issued */
	sha1nexte	MSG1, E1
	mova128		ABCD, E0
	sha1msg2	MSG1, MSG2
	sha1rnds4	$3, E1, ABCD
	xor128		MSG1, MSG3

	/* Rounds 72-75: only the sha1msg2 step for MSG3 remains */
	sha1nexte	MSG2, E0
	mova128		ABCD, E1
	sha1msg2	MSG2, MSG3
	sha1rnds4	$3, E0, ABCD

	/* Rounds 76-79: no schedule work left */
	sha1nexte	MSG3, E1
	mova128		ABCD, E0
	sha1rnds4	$3, E1, ABCD

	/* Fold the stashed incoming state into the result */
	sha1nexte	16(%esp), E0
	paddd		(%esp), ABCD

	/* Undo the word reversal and store the five hash words back */
	shuf128_32	$0x1B, ABCD, ABCD
	movu128		ABCD, 76(%eax)
	extr128_32	$3, E0, 76+4*4(%eax)

	/* Drop the aligned scratch area and the frame */
	movl		%ebp, %esp
	popl		%ebp
	ret
	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
225
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.balign	16
/*
 * pshufb control vector.  In memory the bytes run 0x0f, 0x0e, ... 0x00,
 * so destination byte i is taken from source byte 15-i: the whole
 * 16-byte register is byte-reversed.  The two quadwords below are the
 * low and high halves of that 128-bit value (little-endian order).
 */
PSHUFFLE_BYTE_FLIP_MASK:
	.quad	0x08090a0b0c0d0e0f
	.quad	0x0001020304050607

#endif