blob: 880ffab014ac9b57a56988099c79edd8b46f14ca [file] [log] [blame]
"Robert P. J. Day"63fc1a92006-07-02 19:47:05 +00001/* vi: set sw=4 ts=4: */
Rob Landley5cf7c2d2006-02-21 06:44:43 +00002/*
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02003 * Utility routines.
4 *
5 * Copyright (C) 2010 Denys Vlasenko
6 *
7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
8 */
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02009#include "libbb.h"
10
Denys Vlasenko5f681702022-01-01 12:21:01 +010011#define STR1(s) #s
12#define STR(s) STR1(s)
13
Denys Vlasenkob8935d02017-01-15 20:16:27 +010014#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
15
Denys Vlasenko6472ac92022-02-03 14:15:20 +010016#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
17# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
/* Execute the x86 CPUID instruction.
 * On entry the pointed-to values select the query (all four are passed
 * in as inputs; *eax is the leaf, *ecx the subleaf where relevant);
 * on return they hold the EAX/EBX/ECX/EDX output registers.
 * Used below to detect SHA-NI hardware acceleration support.
 */
static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
{
	asm ("cpuid"
		: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
		: "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
	);
}
25static smallint shaNI;
26void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
27void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
28# if defined(__i386__)
29struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
30# endif
31# if defined(__x86_64__)
32struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
33# endif
34# endif
35#endif
36
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +020037/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
38 * (for rotX32, there is no difference). Why? My guess is that
39 * macro requires clever common subexpression elimination heuristics
40 * in gcc, while inline basically forces it to happen.
41 */
42//#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
43static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
44{
45 return (x << n) | (x >> (32 - n));
46}
47//#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
48static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
49{
50 return (x >> n) | (x << (32 - n));
51}
52/* rotr64 in needed for sha512 only: */
53//#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
54static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
55{
56 return (x >> n) | (x << (64 - n));
57}
58
Lauri Kasanenb8173b62013-01-14 05:20:50 +010059/* rotl64 only used for sha3 currently */
60static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
61{
62 return (x << n) | (x >> (64 - n));
63}
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +020064
Denys Vlasenko302ad142010-10-19 02:16:12 +020065/* Process the remaining bytes in the buffer */
66static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
67{
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +020068 unsigned bufpos = ctx->total64 & 63;
69 /* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
70 ctx->wbuffer[bufpos++] = 0x80;
71
72 /* This loop iterates either once or twice, no more, no less */
73 while (1) {
74 unsigned remaining = 64 - bufpos;
75 memset(ctx->wbuffer + bufpos, 0, remaining);
76 /* Do we have enough space for the length count? */
77 if (remaining >= 8) {
78 /* Store the 64-bit counter of bits in the buffer */
79 uint64_t t = ctx->total64 << 3;
80 if (swap_needed)
81 t = bb_bswap_64(t);
82 /* wbuffer is suitably aligned for this */
Denys Vlasenko1f5e81f2013-06-27 01:03:19 +020083 *(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +020084 }
Denys Vlasenko302ad142010-10-19 02:16:12 +020085 ctx->process_block(ctx);
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +020086 if (remaining >= 8)
87 break;
88 bufpos = 0;
89 }
90}
91
92
93/*
Denys Vlasenko302ad142010-10-19 02:16:12 +020094 * Compute MD5 checksum of strings according to the
95 * definition of MD5 in RFC 1321 from April 1992.
96 *
97 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
98 *
99 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
100 * Copyright (C) 2001 Manuel Novoa III
101 * Copyright (C) 2003 Glenn L. McGrath
102 * Copyright (C) 2003 Erik Andersen
103 *
104 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
105 */
106
107/* 0: fastest, 3: smallest */
Denys Vlasenko522041e2011-09-10 13:25:57 +0200108#if CONFIG_MD5_SMALL < 0
109# define MD5_SMALL 0
110#elif CONFIG_MD5_SMALL > 3
111# define MD5_SMALL 3
Denys Vlasenko302ad142010-10-19 02:16:12 +0200112#else
Denys Vlasenko522041e2011-09-10 13:25:57 +0200113# define MD5_SMALL CONFIG_MD5_SMALL
Denys Vlasenko302ad142010-10-19 02:16:12 +0200114#endif
115
116/* These are the four functions used in the four steps of the MD5 algorithm
117 * and defined in the RFC 1321. The first function is a little bit optimized
118 * (as found in Colin Plumbs public domain implementation).
119 * #define FF(b, c, d) ((b & c) | (~b & d))
120 */
121#undef FF
122#undef FG
123#undef FH
124#undef FI
125#define FF(b, c, d) (d ^ (b & (c ^ d)))
126#define FG(b, c, d) FF(d, b, c)
127#define FH(b, c, d) (b ^ c ^ d)
128#define FI(b, c, d) (c ^ (b | ~d))
129
/* Hash a single block, 64 bytes long and 4-byte aligned.
 * Consumes ctx->wbuffer (16 little-endian 32-bit words) and updates
 * ctx->hash[0..3] in place per RFC 1321 rounds 1-4.
 * Four size/speed variants are selected by MD5_SMALL (0 = fastest,
 * 3 = smallest): 0 is fully unrolled, 1 unrolls four steps per loop,
 * 2 uses one compact loop per round, 3 uses a single 64-step loop
 * with a switch on the round number.
 */
static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
{
#if MD5_SMALL > 0
	/* Before we start, one word to the strange constants.
	   They are defined in RFC 1321 as
	   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
	 */
	static const uint32_t C_array[] ALIGN4 = {
		/* round 1 */
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
		/* round 2 */
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
		/* round 3 */
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
		/* round 4 */
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
	};
	/* Input-word index permutation per round (round 1 is the
	 * identity, so it is only stored when MD5_SMALL > 1) */
	static const char P_array[] ALIGN1 = {
# if MD5_SMALL > 1
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
# endif
		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 /* 4 */
	};
#endif
	uint32_t *words = (void*) ctx->wbuffer;
	uint32_t A = ctx->hash[0];
	uint32_t B = ctx->hash[1];
	uint32_t C = ctx->hash[2];
	uint32_t D = ctx->hash[3];

#if MD5_SMALL >= 2  /* 2 or 3 */

	/* Left-rotation amounts, four per round */
	static const char S_array[] ALIGN1 = {
		7, 12, 17, 22,
		5, 9, 14, 20,
		4, 11, 16, 23,
		6, 10, 15, 21
	};
	const uint32_t *pc;
	const char *pp;
	const char *ps;
	int i;
	uint32_t temp;

	/* The algorithm works on little-endian words: convert in place */
	if (BB_BIG_ENDIAN)
		for (i = 0; i < 16; i++)
			words[i] = SWAP_LE32(words[i]);

# if MD5_SMALL == 3
	/* Smallest variant: one loop, round chosen by i >> 4 */
	pc = C_array;
	pp = P_array;
	ps = S_array - 4;

	for (i = 0; i < 64; i++) {
		if ((i & 0x0f) == 0)
			ps += 4;
		temp = A;
		switch (i >> 4) {
		case 0:
			temp += FF(B, C, D);
			break;
		case 1:
			temp += FG(B, C, D);
			break;
		case 2:
			temp += FH(B, C, D);
			break;
		default: /* case 3 */
			temp += FI(B, C, D);
		}
		temp += words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		/* rotate state (A,B,C,D) by one for the next step */
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# else  /* MD5_SMALL == 2 */
	/* One compact 16-step loop per round */
	pc = C_array;
	pp = P_array;
	ps = S_array;

	for (i = 0; i < 16; i++) {
		temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;

#else  /* MD5_SMALL == 0 or 1 */

# if MD5_SMALL == 1
	const uint32_t *pc;
	const char *pp;
	int i;
# endif

	/* First round: using the given function, the context and a constant
	   the next context is computed.  Because the algorithm's processing
	   unit is a 32-bit word and it is determined to work on words in
	   little endian byte order we perhaps have to change the byte order
	   before the computation.  To reduce the work for the next steps
	   we save swapped words in WORDS array. */
# undef OP
# define OP(a, b, c, d, s, T) \
	do { \
		a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
		words++; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 1 */
# if MD5_SMALL == 1
	pc = C_array;
	for (i = 0; i < 4; i++) {
		OP(A, B, C, D, 7, *pc++);
		OP(D, A, B, C, 12, *pc++);
		OP(C, D, A, B, 17, *pc++);
		OP(B, C, D, A, 22, *pc++);
	}
# else
	OP(A, B, C, D, 7, 0xd76aa478);
	OP(D, A, B, C, 12, 0xe8c7b756);
	OP(C, D, A, B, 17, 0x242070db);
	OP(B, C, D, A, 22, 0xc1bdceee);
	OP(A, B, C, D, 7, 0xf57c0faf);
	OP(D, A, B, C, 12, 0x4787c62a);
	OP(C, D, A, B, 17, 0xa8304613);
	OP(B, C, D, A, 22, 0xfd469501);
	OP(A, B, C, D, 7, 0x698098d8);
	OP(D, A, B, C, 12, 0x8b44f7af);
	OP(C, D, A, B, 17, 0xffff5bb1);
	OP(B, C, D, A, 22, 0x895cd7be);
	OP(A, B, C, D, 7, 0x6b901122);
	OP(D, A, B, C, 12, 0xfd987193);
	OP(C, D, A, B, 17, 0xa679438e);
	OP(B, C, D, A, 22, 0x49b40821);
# endif
	words -= 16;

	/* For the second to fourth round we have the possibly swapped words
	   in WORDS.  Redefine the macro to take an additional first
	   argument specifying the function to use. */
# undef OP
# define OP(f, a, b, c, d, k, s, T) \
	do { \
		a += f(b, c, d) + words[k] + T; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 2 */
# if MD5_SMALL == 1
	pp = P_array;
	for (i = 0; i < 4; i++) {
		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
	}
# else
	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
	OP(FG, D, A, B, C, 10, 9, 0x02441453);
	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
# endif

	/* Round 3 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
	}
# else
	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
# endif

	/* Round 4 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
	}
# else
	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;
#endif
}
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100425#undef OP
Denys Vlasenko302ad142010-10-19 02:16:12 +0200426#undef FF
427#undef FG
428#undef FH
429#undef FI
430
431/* Initialize structure containing state of computation.
432 * (RFC 1321, 3.3: Step 3)
433 */
434void FAST_FUNC md5_begin(md5_ctx_t *ctx)
435{
436 ctx->hash[0] = 0x67452301;
437 ctx->hash[1] = 0xefcdab89;
438 ctx->hash[2] = 0x98badcfe;
439 ctx->hash[3] = 0x10325476;
440 ctx->total64 = 0;
441 ctx->process_block = md5_process_block64;
442}
443
444/* Used also for sha1 and sha256 */
445void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
446{
Denys Vlasenkoabefc3c2020-09-30 22:22:04 +0200447 unsigned bufpos = ctx->total64 & 63;
448
449 ctx->total64 += len;
450
451 while (1) {
452 unsigned remaining = 64 - bufpos;
453 if (remaining > len)
454 remaining = len;
455 /* Copy data into aligned buffer */
456 memcpy(ctx->wbuffer + bufpos, buffer, remaining);
457 len -= remaining;
458 buffer = (const char *)buffer + remaining;
459 bufpos += remaining;
460
461 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
462 bufpos -= 64;
463 if (bufpos != 0)
464 break;
465
466 /* Buffer is filled up, process it */
467 ctx->process_block(ctx);
468 /*bufpos = 0; - already is */
469 }
Denys Vlasenko302ad142010-10-19 02:16:12 +0200470}
471
472/* Process the remaining bytes in the buffer and put result from CTX
473 * in first 16 bytes following RESBUF. The result is always in little
474 * endian byte order, so that a byte-wise output yields to the wanted
475 * ASCII representation of the message digest.
476 */
Denys Vlasenko49ecee02017-01-24 16:00:54 +0100477unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
Denys Vlasenko302ad142010-10-19 02:16:12 +0200478{
479 /* MD5 stores total in LE, need to swap on BE arches: */
480 common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);
481
Denys Vlasenko7ab94ca2010-10-19 02:33:39 +0200482 /* The MD5 result is in little endian byte order */
Denys Vlasenko5368fe52013-01-16 02:20:31 +0100483 if (BB_BIG_ENDIAN) {
484 ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
485 ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
486 ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
487 ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
488 }
489
Denys Vlasenko302ad142010-10-19 02:16:12 +0200490 memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
Denys Vlasenko49ecee02017-01-24 16:00:54 +0100491 return sizeof(ctx->hash[0]) * 4;
Denys Vlasenko302ad142010-10-19 02:16:12 +0200492}
493
494
495/*
Denys Vlasenkof4c93ab2010-10-24 19:27:30 +0200496 * SHA1 part is:
497 * Copyright 2007 Rob Landley <rob@landley.net>
Rob Landley5cf7c2d2006-02-21 06:44:43 +0000498 *
Denys Vlasenkof4c93ab2010-10-24 19:27:30 +0200499 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
500 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
Denis Vlasenko9213a9e2006-09-17 16:28:10 +0000501 *
Denys Vlasenkof4c93ab2010-10-24 19:27:30 +0200502 * Licensed under GPLv2, see file LICENSE in this source tree.
Denis Vlasenko56dceb92008-11-10 13:32:50 +0000503 *
504 * ---------------------------------------------------------------------------
505 *
506 * SHA256 and SHA512 parts are:
507 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
Denis Vlasenkoddb1b852009-03-12 16:05:02 +0000508 * Shrank by Denys Vlasenko.
509 *
510 * ---------------------------------------------------------------------------
511 *
512 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
513 * and replace "4096" with something like "2000 + time(NULL) % 2097",
514 * then rebuild and compare "shaNNNsum bigfile" results.
Rob Landley5cf7c2d2006-02-21 06:44:43 +0000515 */
516
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100517#if CONFIG_SHA1_SMALL == 0
Denys Vlasenko5f681702022-01-01 12:21:01 +0100518# if defined(__GNUC__) && defined(__i386__)
/* Hand-written i386 SHA1 block function (CONFIG_SHA1_SMALL == 0).
 * Hashes the 64 bytes in ctx->wbuffer into ctx->hash[0..4].
 * The 16-word W[] schedule lives on the stack; the five state words
 * live in eax..edx + ebp.  The asm addresses ctx->hash by the fixed
 * offset 76 (checked by BUILD_BUG_ON below), relying on ctx arriving
 * in %eax per the FAST_FUNC calling convention.
 * The RD* macros paste one round step each into the single asm string.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
{
	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76);
	asm(
"\n\
	pushl	%ebp	#			\n\
	pushl	%edi	#			\n\
	pushl	%esi	#			\n\
	pushl	%ebx	#			\n\
	pushl	%eax				\n\
	movl	$15, %edi			\n\
1:						\n\
	movl	(%eax,%edi,4), %esi		\n\
	bswap	%esi				\n\
	pushl	%esi				\n\
	decl	%edi				\n\
	jns	1b				\n\
	movl	80(%eax), %ebx	# b = ctx->hash[1]	\n\
	movl	84(%eax), %ecx	# c = ctx->hash[2]	\n\
	movl	88(%eax), %edx	# d = ctx->hash[3]	\n\
	movl	92(%eax), %ebp	# e = ctx->hash[4]	\n\
	movl	76(%eax), %eax	# a = ctx->hash[0]	\n\
#Register and stack use:			\n\
# eax..edx: a..d				\n\
# ebp: e					\n\
# esi,edi: temps				\n\
# 4*n(%esp): W[n]				\n\
"
#define RD1As(a,b,c,d,e, n, RCONST) \
"\n\
	##movl	4*"n"(%esp), %esi	# n=0, W[0] already in %esi	\n\
	movl	"c", %edi		# c				\n\
	xorl	"d", %edi		# ^d				\n\
	andl	"b", %edi		# &b				\n\
	xorl	"d", %edi		# (((c ^ d) & b) ^ d)		\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + W[n]		\n\
	addl	%edi, "e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD1Bs(a,b,c,d,e, n, RCONST) \
"\n\
	movl	4*"n"(%esp), %esi	# W[n]				\n\
	movl	"c", %edi		# c				\n\
	xorl	"d", %edi		# ^d				\n\
	andl	"b", %edi		# &b				\n\
	xorl	"d", %edi		# (((c ^ d) & b) ^ d)		\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + W[n]		\n\
	addl	%edi, "e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]		\n\
	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]		\n\
	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]		\n\
	xorl	4*"n"(%esp), %esi	# ^W[n & 15]			\n\
	roll	%esi			#				\n\
	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]		\n\
	movl	"c", %edi		# c				\n\
	xorl	"d", %edi		# ^d				\n\
	andl	"b", %edi		# &b				\n\
	xorl	"d", %edi		# (((c ^ d) & b) ^ d)		\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W		\n\
	addl	%edi, "e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x5A827999
	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
	RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
	RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]		\n\
	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]		\n\
	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]		\n\
	xorl	4*"n"(%esp), %esi	# ^W[n & 15]			\n\
	roll	%esi			#				\n\
	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]		\n\
	movl	"c", %edi		# c				\n\
	xorl	"d", %edi		# ^d				\n\
	xorl	"b", %edi		# ^b				\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W		\n\
	addl	%edi, "e"		# e += (c ^ d ^ b)		\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x6ED9EBA1
	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)

#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	movl	"b", %edi		# di: b				\n\
	movl	"b", %esi		# si: b				\n\
	orl	"c", %edi		# di: b | c			\n\
	andl	"c", %esi		# si: b & c			\n\
	andl	"d", %edi		# di: (b | c) & d		\n\
	orl	%esi, %edi		# ((b | c) & d) | (b & c)	\n\
	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]		\n\
	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]		\n\
	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]		\n\
	xorl	4*"n"(%esp), %esi	# ^W[n & 15]			\n\
	roll	%esi			#				\n\
	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]		\n\
	addl	%edi, "e"		# += ((b | c) & d) | (b & c)\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W		\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x8F1BBCDC
	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)

#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]		\n\
	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]		\n\
	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]		\n\
	xorl	4*"n"(%esp), %esi	# ^W[n & 15]			\n\
	roll	%esi			#				\n\
	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]		\n\
	movl	"c", %edi		# c				\n\
	xorl	"d", %edi		# ^d				\n\
	xorl	"b", %edi		# ^b				\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W		\n\
	addl	%edi, "e"		# e += (c ^ d ^ b)		\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]		\n\
	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]		\n\
	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]		\n\
	xorl	4*"n"(%esp), %esi	# ^W[n & 15]			\n\
	roll	%esi			#				\n\
	##movl	%esi, 4*"n"(%esp)	# store to W[n & 15] elided	\n\
	movl	"c", %edi		# c				\n\
	xorl	"d", %edi		# ^d				\n\
	xorl	"b", %edi		# ^b				\n\
	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W		\n\
	addl	%edi, "e"		# e += (c ^ d ^ b)		\n\
	movl	"a", %esi		#				\n\
	roll	$5, %esi		# rotl32(a,5)			\n\
	addl	%esi, "e"		# e += rotl32(a,5)		\n\
	rorl	$2, "b"			# b = rotl32(b,30)		\n\
"
#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0xCA62C1D6
	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)

"\n\
	movl	4*16(%esp), %esi	#			\n\
	addl	$4*(16+1), %esp		#			\n\
	addl	%eax, 76(%esi)		# ctx->hash[0] += a	\n\
	addl	%ebx, 80(%esi)		# ctx->hash[1] += b	\n\
	addl	%ecx, 84(%esi)		# ctx->hash[2] += c	\n\
	addl	%edx, 88(%esi)		# ctx->hash[3] += d	\n\
	addl	%ebp, 92(%esi)		# ctx->hash[4] += e	\n\
	popl	%ebx			#			\n\
	popl	%esi			#			\n\
	popl	%edi			#			\n\
	popl	%ebp			#			\n\
"
	); /* asm */
#undef RCONST
}
719# elif defined(__GNUC__) && defined(__x86_64__)
Denys Vlasenko5f681702022-01-01 12:21:01 +0100720
Denys Vlasenko05fd13e2022-01-03 01:57:29 +0100721/* in hash_md5_sha_x86-64.S */
722struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
Denys Vlasenko711e20e2022-01-07 00:43:59 +0100723void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx);
Denys Vlasenko5f681702022-01-01 12:21:01 +0100724
Denys Vlasenko5f681702022-01-01 12:21:01 +0100725# else
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100726/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
727 * It seems further speedup can be achieved by handling more than
728 * 64 bytes per one function call (coreutils does that).
729 */
730static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
731{
732 static const uint32_t rconsts[] ALIGN4 = {
733 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
734 };
735 uint32_t W[16];
736 uint32_t a, b, c, d, e;
737
738 a = ctx->hash[0];
739 b = ctx->hash[1];
740 c = ctx->hash[2];
741 d = ctx->hash[3];
742 e = ctx->hash[4];
743
Denys Vlasenkof09d0882021-12-31 17:06:00 +0100744/* From kernel source comments:
745 * """
746 * If you have 32 registers or more, the compiler can (and should)
747 * try to change the array[] accesses into registers. However, on
748 * machines with less than ~25 registers, that won't really work,
749 * and at least gcc will make an unholy mess of it.
750 *
751 * So to avoid that mess which just slows things down, we force
752 * the stores to memory to actually happen (we might be better off
753 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
754 * suggested by Artur Skawina - that will also make gcc unable to
755 * try to do the silly "optimize away loads" part because it won't
756 * see what the value will be).
757 * """
758 */
Denys Vlasenko5c0c5582022-01-02 01:56:35 +0100759#if defined(__GNUC__) && defined(__i386__)
Denys Vlasenkof09d0882021-12-31 17:06:00 +0100760# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
761#else
762# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
763#endif
764
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100765#undef OP
766#define OP(A,B,C,D,E, n) \
767 do { \
768 uint32_t work = EXPR(B, C, D); \
769 if (n <= 15) \
Denys Vlasenko0b62a082021-12-30 18:54:02 +0100770 work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100771 if (n >= 16) \
Denys Vlasenko0b62a082021-12-30 18:54:02 +0100772 work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
Denys Vlasenkof09d0882021-12-31 17:06:00 +0100773 DO_NOT_TRY_PROPAGATING(W[n & 15]); \
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100774 E += work + rotl32(A, 5) + rconsts[n / 20]; \
775 B = rotl32(B, 30); \
776 } while (0)
777#define OP20(n) \
778 OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
779 OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
780 OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
781 OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
782
783 /* 4 rounds of 20 operations each */
784#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
785 OP20(0);
786#undef EXPR
787#define EXPR(b,c,d) (c ^ d ^ b)
788 OP20(20);
789#undef EXPR
790#define EXPR(b,c,d) (((b | c) & d) | (b & c))
791 OP20(40);
792#undef EXPR
793#define EXPR(b,c,d) (c ^ d ^ b)
794 OP20(60);
795
796#undef EXPR
797#undef OP
798#undef OP20
799
800 ctx->hash[0] += a;
801 ctx->hash[1] += b;
802 ctx->hash[2] += c;
803 ctx->hash[3] += d;
804 ctx->hash[4] += e;
805}
Denys Vlasenko5f681702022-01-01 12:21:01 +0100806# endif
Denys Vlasenko0b62a082021-12-30 18:54:02 +0100807#elif CONFIG_SHA1_SMALL == 1
/* Middle-sized version, +300 bytes of code on x86.
 * Hashes one 64-byte block from ctx->wbuffer into ctx->hash[]
 * (SHA-1, FIPS 180-2:6.1.2), with the four 20-op rounds written
 * as separate loops instead of being fully unrolled.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	static const uint32_t rconsts[] ALIGN4 = {
		/* Round constants for ops 0-19, 20-39, 40-59, 60-79 */
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int j;
	int n;
	/* Rolling 16-word message schedule; each W[n] is mirrored at
	 * W[n+16] so the W[n+2], W[n+8], W[n+13] reads below never
	 * need "& 15" masking. */
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

	/* 1st round of 20 operations */
	n = 0;
	do {
		/* f() for ops 0-19: Ch(b,c,d) */
		uint32_t work = ((c ^ d) & b) ^ d;
		/* Ops 0-15 consume the (big-endian) input block directly */
		W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]);
		work += W[n];
		work += e + rotl32(a, 5) + rconsts[0];
		/* Rotate by one for next time */
		e = d;
		d = c;
		c = rotl32(b, 30);
		b = a;
		a = work;
		n = (n + 1) & 15;
	} while (n != 0);
	/* Ops 16-19: same f() and constant, but W[] is now expanded */
	do {
		uint32_t work = ((c ^ d) & b) ^ d;
		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		work += W[n];
		work += e + rotl32(a, 5) + rconsts[0];
		e = d;
		d = c;
		c = rotl32(b, 30);
		b = a;
		a = work;
		n = (n + 1) /* & 15*/;	/* n stays < 16 here, masking unneeded */
	} while (n != 4);
	/* 2nd round of 20 operations */
	j = 19;
	do {
		/* f() for ops 20-39: Parity(b,c,d) */
		uint32_t work = c ^ d ^ b;
		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		work += W[n];
		work += e + rotl32(a, 5) + rconsts[1];
		e = d;
		d = c;
		c = rotl32(b, 30);
		b = a;
		a = work;
		n = (n + 1) & 15;
	} while (--j >= 0);
	/* 3rd round */
	j = 19;
	do {
		/* f() for ops 40-59: Maj(b,c,d) */
		uint32_t work = ((b | c) & d) | (b & c);
		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		work += W[n];
		work += e + rotl32(a, 5) + rconsts[2];
		e = d;
		d = c;
		c = rotl32(b, 30);
		b = a;
		a = work;
		n = (n + 1) & 15;
	} while (--j >= 0);
	/* 4th round */
	j = 19;
	do {
		/* f() for ops 60-79: Parity(b,c,d) again */
		uint32_t work = c ^ d ^ b;
		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
		work += W[n];
		work += e + rotl32(a, 5) + rconsts[3];
		e = d;
		d = c;
		c = rotl32(b, 30);
		b = a;
		a = work;
		n = (n + 1) & 15;
	} while (--j >= 0);

	/* Fold this block's result into the running hash */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}
901#else
/* Compact version, almost twice as slow as fully unrolled.
 * Hashes one 64-byte block from ctx->wbuffer into ctx->hash[]
 * (SHA-1, FIPS 180-2:6.1.2); all 80 operations share one loop body.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	static const uint32_t rconsts[] ALIGN4 = {
		/* Round constants for ops 0-19, 20-39, 40-59, 60-79 */
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int i, j;
	int n;
	/* W[n] mirrored at W[n+16] so the W[n+2/8/13] reads below
	 * never need "& 15" masking */
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	/* On-stack work buffer frees up one register in the main loop
	 * which otherwise will be needed to hold ctx pointer.
	 *
	 * The compiler is not smart enough to realize it, though. :(
	 * If __attribute__((optimize("2"))) is added to the function,
	 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
	 * register (making code 6 bytes smaller, not just faster).
	 */
	for (i = 0; i < 16; i++)
		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

	/* 4 rounds of 20 operations each */
	n = 0;
	for (i = 0; i < 4; i++) {
		j = 19;
		do {
			uint32_t work;

			work = c ^ d;
			if (i == 0) {
				/* Ops 0-19: Ch(b,c,d) */
				work = (work & b) ^ d;
				if (j <= 3)
					goto ge16;	/* ops 16-19 need W[] expansion too */
			} else {
				if (i == 2)
					/* Ops 40-59: Maj(b,c,d) */
					work = ((b | c) & d) | (b & c);
				else /* i = 1 or 3 */
					/* Ops 20-39 and 60-79: Parity(b,c,d) */
					work ^= b;
 ge16:
				/* Message schedule expansion past op 15 */
				W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
			}
			work += W[n];
			work += e + rotl32(a, 5) + rconsts[i];

			/* Rotate by one for next time */
			e = d;
			d = c;
			c = rotl32(b, 30);
			b = a;
			a = work;
			n = (n + 1) & 15;
		} while (--j >= 0);
	}

	/* Fold this block's result into the running hash */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}
Denys Vlasenko25aadc82021-12-30 13:07:12 +0100969#endif
Rob Landley5cf7c2d2006-02-21 06:44:43 +0000970
Denis Vlasenko823f10b2009-03-15 04:56:51 +0000971/* Constants for SHA512 from FIPS 180-2:4.2.3.
972 * SHA256 constants from FIPS 180-2:4.2.2
973 * are the most significant half of first 64 elements
974 * of the same array.
975 */
#undef K
#if NEED_SHA512
/* Full 64-bit constants, shared by sha512 (all 80) and sha256 (upper
 * halves of the first 64) */
typedef uint64_t sha_K_int;
# define K(v) v
#else
/* sha512 not built: store only the upper 32-bit halves (= the SHA-256
 * constants), halving the table size */
typedef uint32_t sha_K_int;
# define K(v) (uint32_t)(v >> 32)
#endif
static const sha_K_int sha_K[] ALIGN8 = {
	K(0x428a2f98d728ae22ULL), K(0x7137449123ef65cdULL),
	K(0xb5c0fbcfec4d3b2fULL), K(0xe9b5dba58189dbbcULL),
	K(0x3956c25bf348b538ULL), K(0x59f111f1b605d019ULL),
	K(0x923f82a4af194f9bULL), K(0xab1c5ed5da6d8118ULL),
	K(0xd807aa98a3030242ULL), K(0x12835b0145706fbeULL),
	K(0x243185be4ee4b28cULL), K(0x550c7dc3d5ffb4e2ULL),
	K(0x72be5d74f27b896fULL), K(0x80deb1fe3b1696b1ULL),
	K(0x9bdc06a725c71235ULL), K(0xc19bf174cf692694ULL),
	K(0xe49b69c19ef14ad2ULL), K(0xefbe4786384f25e3ULL),
	K(0x0fc19dc68b8cd5b5ULL), K(0x240ca1cc77ac9c65ULL),
	K(0x2de92c6f592b0275ULL), K(0x4a7484aa6ea6e483ULL),
	K(0x5cb0a9dcbd41fbd4ULL), K(0x76f988da831153b5ULL),
	K(0x983e5152ee66dfabULL), K(0xa831c66d2db43210ULL),
	K(0xb00327c898fb213fULL), K(0xbf597fc7beef0ee4ULL),
	K(0xc6e00bf33da88fc2ULL), K(0xd5a79147930aa725ULL),
	K(0x06ca6351e003826fULL), K(0x142929670a0e6e70ULL),
	K(0x27b70a8546d22ffcULL), K(0x2e1b21385c26c926ULL),
	K(0x4d2c6dfc5ac42aedULL), K(0x53380d139d95b3dfULL),
	K(0x650a73548baf63deULL), K(0x766a0abb3c77b2a8ULL),
	K(0x81c2c92e47edaee6ULL), K(0x92722c851482353bULL),
	K(0xa2bfe8a14cf10364ULL), K(0xa81a664bbc423001ULL),
	K(0xc24b8b70d0f89791ULL), K(0xc76c51a30654be30ULL),
	K(0xd192e819d6ef5218ULL), K(0xd69906245565a910ULL),
	K(0xf40e35855771202aULL), K(0x106aa07032bbd1b8ULL),
	K(0x19a4c116b8d2d0c8ULL), K(0x1e376c085141ab53ULL),
	K(0x2748774cdf8eeb99ULL), K(0x34b0bcb5e19b48a8ULL),
	K(0x391c0cb3c5c95a63ULL), K(0x4ed8aa4ae3418acbULL),
	K(0x5b9cca4f7763e373ULL), K(0x682e6ff3d6b2b8a3ULL),
	K(0x748f82ee5defb2fcULL), K(0x78a5636f43172f60ULL),
	K(0x84c87814a1f0ab72ULL), K(0x8cc702081a6439ecULL),
	K(0x90befffa23631e28ULL), K(0xa4506cebde82bde9ULL),
	K(0xbef9a3f7b2c67915ULL), K(0xc67178f2e372532bULL),
#if NEED_SHA512 /* [64]+ are used for sha512 only */
	K(0xca273eceea26619cULL), K(0xd186b8c721c0c207ULL),
	K(0xeada7dd6cde0eb1eULL), K(0xf57d4f7fee6ed178ULL),
	K(0x06f067aa72176fbaULL), K(0x0a637dc5a2c898a6ULL),
	K(0x113f9804bef90daeULL), K(0x1b710b35131c471bULL),
	K(0x28db77f523047d84ULL), K(0x32caab7b40c72493ULL),
	K(0x3c9ebe0a15c9bebcULL), K(0x431d67c49c100d4cULL),
	K(0x4cc5d4becb3e42b6ULL), K(0x597f299cfc657e2aULL),
	K(0x5fcb6fab3ad6faecULL), K(0x6c44198c4a475817ULL),
#endif
};
#undef K
Denis Vlasenko98c87f72009-03-11 21:15:51 +00001029
Denys Vlasenkofe4ef362009-07-05 20:34:38 +02001030#undef Ch
1031#undef Maj
1032#undef S0
1033#undef S1
1034#undef R0
1035#undef R1
1036
/* Hash one 64-byte block from ctx->wbuffer into ctx->hash[]
 * (SHA-256 compression function, FIPS 180-2:6.2.2) */
static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
{
	unsigned t;
	uint32_t W[64], a, b, c, d, e, f, g, h;
	const uint32_t *words = (uint32_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))

	/* Compute the message schedule according to FIPS 180-2:6.2.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE32(words[t]);	/* input block is big-endian */
	for (/*t = 16*/; t < 64; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];
	f = ctx->hash[5];
	g = ctx->hash[6];
	h = ctx->hash[7];

	/* The actual computation according to FIPS 180-2:6.2.2 step 3. */
	for (t = 0; t < 64; ++t) {
		/* Need to fetch upper half of sha_K[t]
		 * (when sha_K[] is 64-bit - I hope compiler is clever
		 * enough to just fetch the upper half)
		 */
		uint32_t K_t = NEED_SHA512 ? (sha_K[t] >> 32) : sha_K[t];
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
		uint32_t T2 = S0(a) + Maj(a, b, c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.2.2
	   step 4. */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
Denis Vlasenko823f10b2009-03-15 04:56:51 +00001101
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001102#if NEED_SHA512
/* Hash one 128-byte block from ctx->wbuffer into ctx->hash[]
 * (SHA-512 compression function, FIPS 180-2:6.3.2) */
static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
{
	unsigned t;
	uint64_t W[80];
	/* On i386, having assignments here (not later as sha256 does)
	 * produces 99 bytes smaller code with gcc 4.3.1
	 */
	uint64_t a = ctx->hash[0];
	uint64_t b = ctx->hash[1];
	uint64_t c = ctx->hash[2];
	uint64_t d = ctx->hash[3];
	uint64_t e = ctx->hash[4];
	uint64_t f = ctx->hash[5];
	uint64_t g = ctx->hash[6];
	uint64_t h = ctx->hash[7];
	const uint64_t *words = (uint64_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

	/* Compute the message schedule according to FIPS 180-2:6.3.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE64(words[t]);	/* input block is big-endian */
	for (/*t = 16*/; t < 80; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	/* The actual computation according to FIPS 180-2:6.3.2 step 3. */
	for (t = 0; t < 80; ++t) {
		uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
		uint64_t T2 = S0(a) + Maj(a, b, c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.3.2
	   step 4. */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001164#endif /* NEED_SHA512 */
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001165
Denis Vlasenkodefc1ea2008-06-27 02:52:20 +00001166void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
Rob Landley5cf7c2d2006-02-21 06:44:43 +00001167{
Rob Landley5cf7c2d2006-02-21 06:44:43 +00001168 ctx->hash[0] = 0x67452301;
1169 ctx->hash[1] = 0xefcdab89;
1170 ctx->hash[2] = 0x98badcfe;
1171 ctx->hash[3] = 0x10325476;
1172 ctx->hash[4] = 0xc3d2e1f0;
Denis Vlasenkoe9afc462009-03-15 02:28:05 +00001173 ctx->total64 = 0;
1174 ctx->process_block = sha1_process_block64;
Denys Vlasenko711e20e2022-01-07 00:43:59 +01001175#if ENABLE_SHA1_HWACCEL
Denys Vlasenkoa96ccbe2022-01-07 01:32:13 +01001176# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
Denys Vlasenko711e20e2022-01-07 00:43:59 +01001177 {
Denys Vlasenko711e20e2022-01-07 00:43:59 +01001178 if (!shaNI) {
1179 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
1180 cpuid(&eax, &ebx, &ecx, &edx);
Denys Vlasenkoe7ff2942022-01-08 01:25:23 +01001181 shaNI = ((ebx >> 29) << 1) - 1;
Denys Vlasenko711e20e2022-01-07 00:43:59 +01001182 }
1183 if (shaNI > 0)
1184 ctx->process_block = sha1_process_block64_shaNI;
1185 }
1186# endif
1187#endif
Rob Landley5cf7c2d2006-02-21 06:44:43 +00001188}
1189
/* SHA-256 initial hash H(0) (FIPS 180-2:5.3.2); these values are also
 * the upper 32-bit halves of SHA-512's H(0), combined with init512_lo[]
 * in sha512_begin(). The two leading zeros deliberately overlay and
 * clear ctx->total64 when sha256_begin() memcpy's this table. */
static const uint32_t init256[] ALIGN4 = {
	0,
	0,
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19,
};
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001202#if NEED_SHA512
/* Lower 32-bit halves of SHA-512's initial hash H(0) (FIPS 180-2:5.3.3);
 * combined with init256[] (the upper halves) in sha512_begin().
 * Two leading zeros clear the 128-bit byte counter. */
static const uint32_t init512_lo[] ALIGN4 = {
	0,
	0,
	0xf3bcc908,
	0x84caa73b,
	0xfe94f82b,
	0x5f1d36f1,
	0xade682d1,
	0x2b3e6c1f,
	0xfb41bd6b,
	0x137e2179,
};
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001215#endif /* NEED_SHA512 */
Denis Vlasenko823f10b2009-03-15 04:56:51 +00001216
Denys Vlasenkof69f2072018-11-26 13:00:28 +01001217// Note: SHA-384 is identical to SHA-512, except that initial hash values are
1218// 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
1219// 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
1220// and the output is constructed by omitting last two 64-bit words of it.
1221
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001222/* Initialize structure containing state of computation.
1223 (FIPS 180-2:5.3.2) */
1224void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
1225{
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001226 memcpy(&ctx->total64, init256, sizeof(init256));
1227 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
Denis Vlasenkoe9afc462009-03-15 02:28:05 +00001228 ctx->process_block = sha256_process_block64;
Denys Vlasenko6472ac92022-02-03 14:15:20 +01001229#if ENABLE_SHA256_HWACCEL
1230# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1231 {
1232 if (!shaNI) {
1233 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
1234 cpuid(&eax, &ebx, &ecx, &edx);
1235 shaNI = ((ebx >> 29) << 1) - 1;
1236 }
1237 if (shaNI > 0)
1238 ctx->process_block = sha256_process_block64_shaNI;
1239 }
1240# endif
1241#endif
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001242}
Denis Vlasenko823f10b2009-03-15 04:56:51 +00001243
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001244#if NEED_SHA512
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001245/* Initialize structure containing state of computation.
1246 (FIPS 180-2:5.3.3) */
1247void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
1248{
Denis Vlasenko98c87f72009-03-11 21:15:51 +00001249 int i;
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001250 /* Two extra iterations zero out ctx->total64[2] */
1251 uint64_t *tp = ctx->total64;
Denys Vlasenkoabefc3c2020-09-30 22:22:04 +02001252 for (i = 0; i < 8 + 2; i++)
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001253 tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
Denys Vlasenkoa971a192010-10-17 01:35:16 +02001254 /*ctx->total64[0] = ctx->total64[1] = 0; - already done */
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001255}
1256
/* Feed len bytes from buffer into the SHA-512 computation,
 * processing each completed 128-byte block */
void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
{
	/* Bytes already pending in the 128-byte block buffer */
	unsigned bufpos = ctx->total64[0] & 127;
	unsigned remaining;

	/* First increment the byte count. FIPS 180-2 specifies the possible
	   length of the file up to 2^128 _bits_.
	   We compute the number of _bytes_ and convert to bits later. */
	ctx->total64[0] += len;
	if (ctx->total64[0] < len)
		ctx->total64[1]++;	/* carry into the upper 64 bits */

	while (1) {
		remaining = 128 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;

		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;"
		 * (bufpos is unsigned: it wraps around unless it was exactly 128) */
		bufpos -= 128;
		if (bufpos != 0)
			break;

		/* Buffer is filled up, process it */
		sha512_process_block128(ctx);
		/*bufpos = 0; - already is */
	}
}
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001289#endif /* NEED_SHA512 */
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001290
Denis Vlasenko823f10b2009-03-15 04:56:51 +00001291/* Used also for sha256 */
Denys Vlasenko49ecee02017-01-24 16:00:54 +01001292unsigned FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
Rob Landley5cf7c2d2006-02-21 06:44:43 +00001293{
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001294 unsigned hash_size;
Rob Landley5cf7c2d2006-02-21 06:44:43 +00001295
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001296 /* SHA stores total in BE, need to swap on LE arches: */
Denys Vlasenko302ad142010-10-19 02:16:12 +02001297 common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);
Rob Landley5cf7c2d2006-02-21 06:44:43 +00001298
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001299 hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
Denis Vlasenkoc8329c92009-03-12 19:06:18 +00001300 /* This way we do not impose alignment constraints on resbuf: */
Denys Vlasenko245a4f82009-11-07 01:31:14 +01001301 if (BB_LITTLE_ENDIAN) {
1302 unsigned i;
Denys Vlasenkoc48a5c62010-10-18 14:48:30 +02001303 for (i = 0; i < hash_size; ++i)
Denys Vlasenkob102e122010-10-18 11:39:47 +02001304 ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
Denys Vlasenko245a4f82009-11-07 01:31:14 +01001305 }
Denys Vlasenko49ecee02017-01-24 16:00:54 +01001306 hash_size *= sizeof(ctx->hash[0]);
1307 memcpy(resbuf, ctx->hash, hash_size);
1308 return hash_size;
Denis Vlasenko56dceb92008-11-10 13:32:50 +00001309}
1310
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001311#if NEED_SHA512
/* Finalize SHA-512: pad, append the 128-bit length, process the last
 * block(s) and copy the 64-byte digest to resbuf. Returns digest size. */
unsigned FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
{
	unsigned bufpos = ctx->total64[0] & 127;

	/* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	while (1) {
		unsigned remaining = 128 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		/* If the 16-byte length field fits, this is the final block;
		 * otherwise this block is all padding and we loop once more */
		if (remaining >= 16) {
			/* Store the 128-bit counter of bits in the buffer in BE format */
			uint64_t t;
			t = ctx->total64[0] << 3;	/* bytes -> bits (low word) */
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
			t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);	/* high word */
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
		}
		sha512_process_block128(ctx);
		if (remaining >= 16)
			break;
		bufpos = 0;
	}

	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
			ctx->hash[i] = SWAP_BE64(ctx->hash[i]);	/* digest is big-endian */
	}
	memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
	return sizeof(ctx->hash);
}
Denys Vlasenkob8935d02017-01-15 20:16:27 +01001346#endif /* NEED_SHA512 */
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001347
1348
1349/*
1350 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
1351 * Michael Peeters and Gilles Van Assche. For more information, feedback or
1352 * questions, please refer to our website: http://keccak.noekeon.org/
1353 *
1354 * Implementation by Ronny Van Keer,
1355 * hereby denoted as "the implementer".
1356 *
1357 * To the extent possible under law, the implementer has waived all copyright
1358 * and related or neighboring rights to the source code in this file.
1359 * http://creativecommons.org/publicdomain/zero/1.0/
1360 *
1361 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
1362 */
1363
/* Clamp CONFIG_SHA3_SMALL into the supported 0..1 range */
#if CONFIG_SHA3_SMALL < 0
# define SHA3_SMALL 0
#elif CONFIG_SHA3_SMALL > 1
# define SHA3_SMALL 1
#else
# define SHA3_SMALL CONFIG_SHA3_SMALL
#endif
1371
/* Set to 1 (via the #if block below) to enable the bit-sliced
 * 32-bit Keccak variant implemented by split_halves()/combine_halves() */
#define OPTIMIZE_SHA3_FOR_32 0
/*
 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 * every 64-bit word of state[] can be split into two 32-bit words
 * by even/odd bits. In this form, all rotations of sha3 round
 * are 32-bit - and there are lots of them.
 * However, it requires either splitting/combining state words
 * before/after sha3 round (code does this now)
 * or shuffling bits before xor'ing them into state and in sha3_end.
 * Without shuffling, bit-slicing results in -130 bytes of code
 * and marginal speedup (but of course it gives wrong result).
 * With shuffling it works, but +260 code bytes, and slower.
 * Disabled for now:
 */
#if 0 /* LONG_MAX == 0x7fffffff */
# undef OPTIMIZE_SHA3_FOR_32
# define OPTIMIZE_SHA3_FOR_32 1
#endif
1390
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001391#if OPTIMIZE_SHA3_FOR_32
/* Split each 64-bit word of the 25-word Keccak state in-place:
 * even-numbered bits are gathered into the first 32-bit half,
 * odd-numbered bits into the second.
 */
static void split_halves(uint64_t *state)
{
	/* Bit-unshuffle (Henry S. Warren, Hacker's Delight, 2002) */
	uint32_t *w = (uint32_t*)state;
	int i = 25;
	do {
		uint32_t lo, hi, m;
		lo = w[0];
		m = (lo ^ (lo >> 1)) & 0x22222222; lo ^= m ^ (m << 1);
		m = (lo ^ (lo >> 2)) & 0x0C0C0C0C; lo ^= m ^ (m << 2);
		m = (lo ^ (lo >> 4)) & 0x00F000F0; lo ^= m ^ (m << 4);
		m = (lo ^ (lo >> 8)) & 0x0000FF00; lo ^= m ^ (m << 8);
		hi = w[1];
		m = (hi ^ (hi >> 1)) & 0x22222222; hi ^= m ^ (m << 1);
		m = (hi ^ (hi >> 2)) & 0x0C0C0C0C; hi ^= m ^ (m << 2);
		m = (hi ^ (hi >> 4)) & 0x00F000F0; hi ^= m ^ (m << 4);
		m = (hi ^ (hi >> 8)) & 0x0000FF00; hi ^= m ^ (m << 8);
		/* Recombine: even-bit halves into word 0, odd into word 1 */
		w[0] = (lo & 0x0000FFFF) | (hi << 16);
		w[1] = (lo >> 16) | (hi & 0xFFFF0000);
		w += 2;
	} while (--i != 0);
}
/* The inverse of split_halves(): re-interleave the even/odd bit
 * halves of every 64-bit state word, in-place. */
static void combine_halves(uint64_t *state)
{
	uint32_t *w = (uint32_t*)state;
	int i = 25;
	do {
		uint32_t a, b, m;
		a = w[0];
		b = w[1];
		/* Undo the final 16-bit cross-exchange first */
		m = (a & 0x0000FFFF) | (b << 16);
		b = (a >> 16) | (b & 0xFFFF0000);
		a = m;
		/* Then reverse the unshuffle, widest groups first */
		m = (a ^ (a >> 8)) & 0x0000FF00; a ^= m ^ (m << 8);
		m = (a ^ (a >> 4)) & 0x00F000F0; a ^= m ^ (m << 4);
		m = (a ^ (a >> 2)) & 0x0C0C0C0C; a ^= m ^ (m << 2);
		m = (a ^ (a >> 1)) & 0x22222222; a ^= m ^ (m << 1);
		w[0] = a;
		m = (b ^ (b >> 8)) & 0x0000FF00; b ^= m ^ (m << 8);
		m = (b ^ (b >> 4)) & 0x00F000F0; b ^= m ^ (m << 4);
		m = (b ^ (b >> 2)) & 0x0C0C0C0C; b ^= m ^ (m << 2);
		m = (b ^ (b >> 1)) & 0x22222222; b ^= m ^ (m << 1);
		w[1] = b;
		w += 2;
	} while (--i != 0);
}
1441#endif
1442
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001443/*
1444 * In the crypto literature this function is usually called Keccak-f().
Denys Vlasenkoac4100e2013-01-15 16:27:39 +01001445 */
Denys Vlasenkoe4f0f262013-01-16 12:23:23 +01001446static void sha3_process_block72(uint64_t *state)
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001447{
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001448 enum { NROUNDS = 24 };
1449
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001450#if OPTIMIZE_SHA3_FOR_32
1451 /*
Denys Vlasenko965b7952020-11-30 13:03:03 +01001452 static const uint32_t IOTA_CONST_0[NROUNDS] ALIGN4 = {
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001453 0x00000001UL,
1454 0x00000000UL,
1455 0x00000000UL,
1456 0x00000000UL,
1457 0x00000001UL,
1458 0x00000001UL,
1459 0x00000001UL,
1460 0x00000001UL,
1461 0x00000000UL,
1462 0x00000000UL,
1463 0x00000001UL,
1464 0x00000000UL,
1465 0x00000001UL,
1466 0x00000001UL,
1467 0x00000001UL,
1468 0x00000001UL,
1469 0x00000000UL,
1470 0x00000000UL,
1471 0x00000000UL,
1472 0x00000000UL,
1473 0x00000001UL,
1474 0x00000000UL,
1475 0x00000001UL,
1476 0x00000000UL,
1477 };
1478 ** bits are in lsb: 0101 0000 1111 0100 1111 0001
1479 */
1480 uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
Denys Vlasenko965b7952020-11-30 13:03:03 +01001481 static const uint32_t IOTA_CONST_1[NROUNDS] ALIGN4 = {
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001482 0x00000000UL,
1483 0x00000089UL,
1484 0x8000008bUL,
1485 0x80008080UL,
1486 0x0000008bUL,
1487 0x00008000UL,
1488 0x80008088UL,
1489 0x80000082UL,
1490 0x0000000bUL,
1491 0x0000000aUL,
1492 0x00008082UL,
1493 0x00008003UL,
1494 0x0000808bUL,
1495 0x8000000bUL,
1496 0x8000008aUL,
1497 0x80000081UL,
1498 0x80000081UL,
1499 0x80000008UL,
1500 0x00000083UL,
1501 0x80008003UL,
1502 0x80008088UL,
1503 0x80000088UL,
1504 0x00008000UL,
1505 0x80008082UL,
1506 };
1507
1508 uint32_t *const s32 = (uint32_t*)state;
1509 unsigned round;
1510
1511 split_halves(state);
1512
1513 for (round = 0; round < NROUNDS; round++) {
1514 unsigned x;
1515
1516 /* Theta */
1517 {
1518 uint32_t BC[20];
1519 for (x = 0; x < 10; ++x) {
1520 BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
1521 }
1522 for (x = 0; x < 10; x += 2) {
1523 uint32_t ta, tb;
1524 ta = BC[x+8] ^ rotl32(BC[x+3], 1);
1525 tb = BC[x+9] ^ BC[x+2];
1526 s32[x+0] ^= ta;
1527 s32[x+1] ^= tb;
1528 s32[x+10] ^= ta;
1529 s32[x+11] ^= tb;
1530 s32[x+20] ^= ta;
1531 s32[x+21] ^= tb;
1532 s32[x+30] ^= ta;
1533 s32[x+31] ^= tb;
1534 s32[x+40] ^= ta;
1535 s32[x+41] ^= tb;
1536 }
1537 }
1538 /* RhoPi */
1539 {
1540 uint32_t t0a,t0b, t1a,t1b;
1541 t1a = s32[1*2+0];
1542 t1b = s32[1*2+1];
1543
1544#define RhoPi(PI_LANE, ROT_CONST) \
1545 t0a = s32[PI_LANE*2+0];\
1546 t0b = s32[PI_LANE*2+1];\
1547 if (ROT_CONST & 1) {\
1548 s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
1549 s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
1550 } else {\
1551 s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
1552 s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
1553 }\
1554 t1a = t0a; t1b = t0b;
1555
1556 RhoPi(10, 1)
1557 RhoPi( 7, 3)
1558 RhoPi(11, 6)
1559 RhoPi(17,10)
1560 RhoPi(18,15)
1561 RhoPi( 3,21)
1562 RhoPi( 5,28)
1563 RhoPi(16,36)
1564 RhoPi( 8,45)
1565 RhoPi(21,55)
1566 RhoPi(24, 2)
1567 RhoPi( 4,14)
1568 RhoPi(15,27)
1569 RhoPi(23,41)
1570 RhoPi(19,56)
1571 RhoPi(13, 8)
1572 RhoPi(12,25)
1573 RhoPi( 2,43)
1574 RhoPi(20,62)
1575 RhoPi(14,18)
1576 RhoPi(22,39)
1577 RhoPi( 9,61)
1578 RhoPi( 6,20)
1579 RhoPi( 1,44)
1580#undef RhoPi
1581 }
1582 /* Chi */
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001583 for (x = 0; x <= 40;) {
1584 uint32_t BC0, BC1, BC2, BC3, BC4;
1585 BC0 = s32[x + 0*2];
1586 BC1 = s32[x + 1*2];
1587 BC2 = s32[x + 2*2];
1588 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1589 BC3 = s32[x + 3*2];
1590 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1591 BC4 = s32[x + 4*2];
1592 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1593 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1594 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1595 x++;
1596 BC0 = s32[x + 0*2];
1597 BC1 = s32[x + 1*2];
1598 BC2 = s32[x + 2*2];
1599 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1600 BC3 = s32[x + 3*2];
1601 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1602 BC4 = s32[x + 4*2];
1603 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1604 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1605 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1606 x += 9;
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001607 }
1608 /* Iota */
1609 s32[0] ^= IOTA_CONST_0bits & 1;
1610 IOTA_CONST_0bits >>= 1;
1611 s32[1] ^= IOTA_CONST_1[round];
1612 }
1613
1614 combine_halves(state);
1615#else
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001616 /* Native 64-bit algorithm */
Denys Vlasenko965b7952020-11-30 13:03:03 +01001617 static const uint16_t IOTA_CONST[NROUNDS] ALIGN2 = {
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001618 /* Elements should be 64-bit, but top half is always zero
1619 * or 0x80000000. We encode 63rd bits in a separate word below.
1620 * Same is true for 31th bits, which lets us use 16-bit table
1621 * instead of 64-bit. The speed penalty is lost in the noise.
1622 */
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001623 0x0001,
1624 0x8082,
1625 0x808a,
1626 0x8000,
1627 0x808b,
1628 0x0001,
1629 0x8081,
1630 0x8009,
1631 0x008a,
1632 0x0088,
1633 0x8009,
1634 0x000a,
1635 0x808b,
1636 0x008b,
1637 0x8089,
1638 0x8003,
1639 0x8002,
1640 0x0080,
1641 0x800a,
1642 0x000a,
1643 0x8081,
1644 0x8080,
1645 0x0001,
1646 0x8008,
1647 };
1648 /* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
1649 const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
1650 /* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
1651 const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);
1652
Denys Vlasenko965b7952020-11-30 13:03:03 +01001653 static const uint8_t ROT_CONST[24] ALIGN1 = {
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001654 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
1655 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
1656 };
Denys Vlasenko965b7952020-11-30 13:03:03 +01001657 static const uint8_t PI_LANE[24] ALIGN1 = {
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001658 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
1659 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
1660 };
Denys Vlasenko965b7952020-11-30 13:03:03 +01001661 /*static const uint8_t MOD5[10] ALIGN1 = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/
Denys Vlasenko8fb3ab52013-01-15 22:07:48 +01001662
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001663 unsigned x;
Denys Vlasenko5b7f50f2013-01-15 19:52:30 +01001664 unsigned round;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001665
1666 if (BB_BIG_ENDIAN) {
1667 for (x = 0; x < 25; x++) {
1668 state[x] = SWAP_LE64(state[x]);
1669 }
1670 }
1671
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001672 for (round = 0; round < NROUNDS; ++round) {
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001673 /* Theta */
Denys Vlasenko30a86522013-01-15 01:12:26 +01001674 {
Denys Vlasenkoa55df272013-01-15 15:22:30 +01001675 uint64_t BC[10];
Denys Vlasenko30a86522013-01-15 01:12:26 +01001676 for (x = 0; x < 5; ++x) {
Denys Vlasenkoa55df272013-01-15 15:22:30 +01001677 BC[x + 5] = BC[x] = state[x]
1678 ^ state[x + 5] ^ state[x + 10]
1679 ^ state[x + 15] ^ state[x + 20];
Denys Vlasenko30a86522013-01-15 01:12:26 +01001680 }
Denys Vlasenkoa55df272013-01-15 15:22:30 +01001681 /* Using 2x5 vector above eliminates the need to use
Denys Vlasenko5b7f50f2013-01-15 19:52:30 +01001682 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
Denys Vlasenkoa55df272013-01-15 15:22:30 +01001683 * and the code is a bit _smaller_.
1684 */
Denys Vlasenko30a86522013-01-15 01:12:26 +01001685 for (x = 0; x < 5; ++x) {
Denys Vlasenkoa55df272013-01-15 15:22:30 +01001686 uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
Denys Vlasenko8fb3ab52013-01-15 22:07:48 +01001687 state[x] ^= temp;
1688 state[x + 5] ^= temp;
1689 state[x + 10] ^= temp;
1690 state[x + 15] ^= temp;
1691 state[x + 20] ^= temp;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001692 }
1693 }
1694
1695 /* Rho Pi */
Denys Vlasenko30a86522013-01-15 01:12:26 +01001696 if (SHA3_SMALL) {
1697 uint64_t t1 = state[1];
1698 for (x = 0; x < 24; ++x) {
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001699 uint64_t t0 = state[PI_LANE[x]];
1700 state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
Denys Vlasenko30a86522013-01-15 01:12:26 +01001701 t1 = t0;
1702 }
1703 } else {
Denys Vlasenkoa55df272013-01-15 15:22:30 +01001704 /* Especially large benefit for 32-bit arch (75% faster):
Denys Vlasenko30a86522013-01-15 01:12:26 +01001705 * 64-bit rotations by non-constant usually are SLOW on those.
1706 * We resort to unrolling here.
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001707 * This optimizes out PI_LANE[] and ROT_CONST[],
Denys Vlasenko30a86522013-01-15 01:12:26 +01001708 * but generates 300-500 more bytes of code.
1709 */
1710 uint64_t t0;
1711 uint64_t t1 = state[1];
1712#define RhoPi_twice(x) \
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001713 t0 = state[PI_LANE[x ]]; \
1714 state[PI_LANE[x ]] = rotl64(t1, ROT_CONST[x ]); \
1715 t1 = state[PI_LANE[x+1]]; \
1716 state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
Denys Vlasenko30a86522013-01-15 01:12:26 +01001717 RhoPi_twice(0); RhoPi_twice(2);
1718 RhoPi_twice(4); RhoPi_twice(6);
1719 RhoPi_twice(8); RhoPi_twice(10);
1720 RhoPi_twice(12); RhoPi_twice(14);
1721 RhoPi_twice(16); RhoPi_twice(18);
1722 RhoPi_twice(20); RhoPi_twice(22);
1723#undef RhoPi_twice
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001724 }
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001725 /* Chi */
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001726# if LONG_MAX > 0x7fffffff
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001727 for (x = 0; x <= 20; x += 5) {
Denys Vlasenko8fb3ab52013-01-15 22:07:48 +01001728 uint64_t BC0, BC1, BC2, BC3, BC4;
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001729 BC0 = state[x + 0];
1730 BC1 = state[x + 1];
1731 BC2 = state[x + 2];
1732 state[x + 0] = BC0 ^ ((~BC1) & BC2);
1733 BC3 = state[x + 3];
1734 state[x + 1] = BC1 ^ ((~BC2) & BC3);
1735 BC4 = state[x + 4];
1736 state[x + 2] = BC2 ^ ((~BC3) & BC4);
1737 state[x + 3] = BC3 ^ ((~BC4) & BC0);
1738 state[x + 4] = BC4 ^ ((~BC0) & BC1);
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001739 }
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001740# else
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001741 /* Reduced register pressure version
1742 * for register-starved 32-bit arches
1743 * (i386: -95 bytes, and it is _faster_)
1744 */
1745 for (x = 0; x <= 40;) {
1746 uint32_t BC0, BC1, BC2, BC3, BC4;
1747 uint32_t *const s32 = (uint32_t*)state;
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001748# if SHA3_SMALL
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001749 do_half:
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001750# endif
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001751 BC0 = s32[x + 0*2];
1752 BC1 = s32[x + 1*2];
1753 BC2 = s32[x + 2*2];
1754 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1755 BC3 = s32[x + 3*2];
1756 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1757 BC4 = s32[x + 4*2];
1758 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1759 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1760 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1761 x++;
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001762# if SHA3_SMALL
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001763 if (x & 1)
1764 goto do_half;
1765 x += 8;
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001766# else
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001767 BC0 = s32[x + 0*2];
1768 BC1 = s32[x + 1*2];
1769 BC2 = s32[x + 2*2];
1770 s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
1771 BC3 = s32[x + 3*2];
1772 s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
1773 BC4 = s32[x + 4*2];
1774 s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
1775 s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
1776 s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
1777 x += 9;
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001778# endif
Denys Vlasenko4ff933c2014-07-30 14:18:57 +02001779 }
Denys Vlasenko09a0e222014-07-30 16:26:09 +02001780# endif /* long is 32-bit */
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001781 /* Iota */
Denys Vlasenko5368fe52013-01-16 02:20:31 +01001782 state[0] ^= IOTA_CONST[round]
1783 | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
1784 | (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001785 }
1786
1787 if (BB_BIG_ENDIAN) {
1788 for (x = 0; x < 25; x++) {
1789 state[x] = SWAP_LE64(state[x]);
1790 }
1791 }
Denys Vlasenko2a563ea2014-07-25 17:24:13 +02001792#endif
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001793}
1794
1795void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
1796{
1797 memset(ctx, 0, sizeof(*ctx));
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001798 /* SHA3-512, user can override */
1799 ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001800}
1801
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001802void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001803{
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001804#if SHA3_SMALL
1805 const uint8_t *data = buffer;
1806 unsigned bufpos = ctx->bytes_queued;
1807
1808 while (1) {
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001809 unsigned remaining = ctx->input_block_bytes - bufpos;
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001810 if (remaining > len)
1811 remaining = len;
1812 len -= remaining;
1813 /* XOR data into buffer */
1814 while (remaining != 0) {
1815 uint8_t *buf = (uint8_t*)ctx->state;
1816 buf[bufpos] ^= *data++;
1817 bufpos++;
1818 remaining--;
1819 }
Denys Vlasenkoabefc3c2020-09-30 22:22:04 +02001820
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001821 /* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001822 bufpos -= ctx->input_block_bytes;
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001823 if (bufpos != 0)
1824 break;
Denys Vlasenkoabefc3c2020-09-30 22:22:04 +02001825
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001826 /* Buffer is filled up, process it */
1827 sha3_process_block72(ctx->state);
1828 /*bufpos = 0; - already is */
1829 }
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001830 ctx->bytes_queued = bufpos + ctx->input_block_bytes;
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001831#else
1832 /* +50 bytes code size, but a bit faster because of long-sized XORs */
1833 const uint8_t *data = buffer;
1834 unsigned bufpos = ctx->bytes_queued;
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001835 unsigned iblk_bytes = ctx->input_block_bytes;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001836
1837 /* If already data in queue, continue queuing first */
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001838 if (bufpos != 0) {
1839 while (len != 0) {
1840 uint8_t *buf = (uint8_t*)ctx->state;
1841 buf[bufpos] ^= *data++;
1842 len--;
1843 bufpos++;
1844 if (bufpos == iblk_bytes) {
1845 bufpos = 0;
1846 goto do_block;
1847 }
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001848 }
1849 }
1850
1851 /* Absorb complete blocks */
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001852 while (len >= iblk_bytes) {
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001853 /* XOR data onto beginning of state[].
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001854 * We try to be efficient - operate one word at a time, not byte.
1855 * Careful wrt unaligned access: can't just use "*(long*)data"!
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001856 */
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001857 unsigned count = iblk_bytes / sizeof(long);
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001858 long *buf = (long*)ctx->state;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001859 do {
1860 long v;
1861 move_from_unaligned_long(v, (long*)data);
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001862 *buf++ ^= v;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001863 data += sizeof(long);
1864 } while (--count);
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001865 len -= iblk_bytes;
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001866 do_block:
Denys Vlasenkoe4f0f262013-01-16 12:23:23 +01001867 sha3_process_block72(ctx->state);
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001868 }
1869
1870 /* Queue remaining data bytes */
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001871 while (len != 0) {
1872 uint8_t *buf = (uint8_t*)ctx->state;
1873 buf[bufpos] ^= *data++;
1874 bufpos++;
1875 len--;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001876 }
Denys Vlasenko970aa6b2013-01-15 22:19:24 +01001877
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001878 ctx->bytes_queued = bufpos;
1879#endif
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001880}
1881
Denys Vlasenko49ecee02017-01-24 16:00:54 +01001882unsigned FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001883{
1884 /* Padding */
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001885 uint8_t *buf = (uint8_t*)ctx->state;
Denys Vlasenko71a090f2016-08-29 14:05:25 +02001886 /*
1887 * Keccak block padding is: add 1 bit after last bit of input,
1888 * then add zero bits until the end of block, and add the last 1 bit
1889 * (the last bit in the block) - the "10*1" pattern.
1890 * SHA3 standard appends additional two bits, 01, before that padding:
1891 *
1892 * SHA3-224(M) = KECCAK[448](M||01, 224)
1893 * SHA3-256(M) = KECCAK[512](M||01, 256)
1894 * SHA3-384(M) = KECCAK[768](M||01, 384)
1895 * SHA3-512(M) = KECCAK[1024](M||01, 512)
1896 * (M is the input, || is bit concatenation)
1897 *
1898 * The 6 below contains 01 "SHA3" bits and the first 1 "Keccak" bit:
1899 */
1900 buf[ctx->bytes_queued] ^= 6; /* bit pattern 00000110 */
1901 buf[ctx->input_block_bytes - 1] ^= 0x80;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001902
Denys Vlasenkoe4f0f262013-01-16 12:23:23 +01001903 sha3_process_block72(ctx->state);
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001904
1905 /* Output */
Denys Vlasenko2cfcc9e2013-01-20 00:38:09 +01001906 memcpy(resbuf, ctx->state, 64);
Denys Vlasenko49ecee02017-01-24 16:00:54 +01001907 return 64;
Lauri Kasanenb8173b62013-01-14 05:20:50 +01001908}