libbb: add optional support for SHA256/512 encrypted passwords

function                                             old     new   delta
sha_crypt                                              -    2423   +2423
cryptpw_main                                         128     183     +55
to64                                                   -      29     +29
pw_encrypt                                           974    1000     +26
str_rounds                                             -      11     +11
login_main                                          1532    1541      +9
packed_usage                                       25215   25200     -15
__md5_to64                                            29       -     -29
------------------------------------------------------------------------------
(add/remove: 3/1 grow/shrink: 3/1 up/down: 2553/-44)         Total: 2509 bytes

diff --git a/libbb/pw_encrypt_sha.c b/libbb/pw_encrypt_sha.c
new file mode 100644
index 0000000..9acbabb
--- /dev/null
+++ b/libbb/pw_encrypt_sha.c
@@ -0,0 +1,251 @@
+/* SHA256 and SHA512-based Unix crypt implementation.
+ * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
+ */
+
+/* printf format of the optional rounds spec; its first 7 chars are "rounds=".  */
+static const char str_rounds[] = "rounds=%u$";
+
+/* Maximum salt string length; longer salts are truncated to this length.  */
+#define SALT_LEN_MAX 16
+/* Default number of rounds if not explicitly specified.  */
+#define ROUNDS_DEFAULT 5000
+/* Minimum number of rounds; smaller requested values are raised to it.  */
+#define ROUNDS_MIN 1000
+/* Maximum number of rounds; larger requested values are lowered to it.  */
+#define ROUNDS_MAX 999999999
+
+/* Compute a SHA-256 ("$5$") or SHA-512 ("$6$") based crypt-style password hash.
+ *
+ * key_data:  NUL-terminated password; only a private copy of it is modified.
+ * salt_data: "$5$" or "$6$", an optional "rounds=N$", then the salt, which
+ *            ends at the next '$' or NUL; at most SALT_LEN_MAX chars are used.
+ *            The 3-char prefix is skipped unchecked: the caller must validate it.
+ *
+ * Returns an xzalloc'ed "$N$[rounds=N$]salt$hash" string, not freed here.
+ */
+static char *
+NOINLINE
+sha_crypt(/*const*/ char *key_data, /*const*/ char *salt_data)
+{
+	void (*sha_begin)(void *ctx) FAST_FUNC;
+	void (*sha_hash)(const void *buffer, size_t len, void *ctx) FAST_FUNC;
+	void* (*sha_end)(void *resbuf, void *ctx) FAST_FUNC;
+	int _32or64;
+
+	char *result, *resptr;
+
+	/* btw, sha256 needs [32] and uint32_t only */
+	unsigned char alt_result[64] __attribute__((__aligned__(__alignof__(uint64_t))));
+	unsigned char temp_result[64] __attribute__((__aligned__(__alignof__(uint64_t))));
+	union {
+		sha256_ctx_t x;
+		sha512_ctx_t y;
+	} ctx, alt_ctx;
+	unsigned salt_len;
+	unsigned key_len;
+	unsigned cnt;
+	unsigned rounds;
+	char *cp;
+	char is_sha512;
+
+	/* Analyze salt, construct already known part of result.
+	 * +3: clamping a short "rounds=N$" up to ROUNDS_MIN adds up to 3 digits. */
+	cnt = strlen(salt_data) + 3 + 1 + 43 + 1;
+	is_sha512 = salt_data[1];
+	if (is_sha512 == '6')
+		cnt += 43;
+	result = resptr = xzalloc(cnt); /* will provide NUL terminator */
+	*resptr++ = '$';
+	*resptr++ = is_sha512;
+	*resptr++ = '$';
+	rounds = ROUNDS_DEFAULT;
+	salt_data += 3;
+	if (strncmp(salt_data, str_rounds, 7) == 0) {
+		/* 7 == strlen("rounds=") */
+		char *endp;
+		unsigned srounds = bb_strtou(salt_data + 7, &endp, 10);
+		if (*endp == '$') {
+			salt_data = endp + 1;
+			rounds = srounds;
+			if (rounds < ROUNDS_MIN)
+				rounds = ROUNDS_MIN;
+			if (rounds > ROUNDS_MAX)
+				rounds = ROUNDS_MAX;
+			/* add "rounds=NNNNN$" whenever it was given, even if default */
+			resptr += sprintf(resptr, str_rounds, rounds);
+		}
+	}
+	salt_len = strchrnul(salt_data, '$') - salt_data;
+	if (salt_len > SALT_LEN_MAX)
+		salt_len = SALT_LEN_MAX;
+	/* xstrdup assures suitable alignment; also we will use it
+	   as a scratch space later. */
+	salt_data = xstrndup(salt_data, salt_len);
+	strcpy(resptr, salt_data);
+	resptr += salt_len;
+	*resptr++ = '$';
+	/* key data doesn't need much processing */
+	key_len = strlen(key_data);
+	key_data = xstrdup(key_data);
+
+	/* Which flavor of SHAnnn ops to use? */
+	sha_begin = (void*)sha256_begin;
+	sha_hash = (void*)sha256_hash;
+	sha_end = (void*)sha256_end;
+	_32or64 = 32;
+	if (is_sha512 == '6') {
+		sha_begin = (void*)sha512_begin;
+		sha_hash = (void*)sha512_hash;
+		sha_end = (void*)sha512_end;
+		_32or64 = 64;
+	}
+
+	/* Add KEY, SALT.  */
+	sha_begin(&ctx);
+	sha_hash(key_data, key_len, &ctx);
+	sha_hash(salt_data, salt_len, &ctx);
+
+	/* Compute alternate SHA sum with input KEY, SALT, and KEY.
+	   The final result will be added to the first context.  */
+	sha_begin(&alt_ctx);
+	sha_hash(key_data, key_len, &alt_ctx);
+	sha_hash(salt_data, salt_len, &alt_ctx);
+	sha_hash(key_data, key_len, &alt_ctx);
+	sha_end(alt_result, &alt_ctx);
+
+	/* Add to the first context one byte of the alternate sum per key character.  */
+	for (cnt = key_len; cnt > _32or64; cnt -= _32or64)
+		sha_hash(alt_result, _32or64, &ctx);
+	sha_hash(alt_result, cnt, &ctx);
+
+	/* Take the binary representation of the length of the key and for every
+	   1 add the alternate sum, for every 0 the key.  */
+	for (cnt = key_len; cnt != 0; cnt >>= 1)
+		if ((cnt & 1) != 0)
+			sha_hash(alt_result, _32or64, &ctx);
+		else
+			sha_hash(key_data, key_len, &ctx);
+
+	/* Create intermediate result.  */
+	sha_end(alt_result, &ctx);
+
+	/* P sequence: hash the entire password once per password character.  */
+	sha_begin(&alt_ctx);
+	for (cnt = 0; cnt < key_len; ++cnt)
+		sha_hash(key_data, key_len, &alt_ctx);
+	sha_end(temp_result, &alt_ctx);
+
+	/* NB: past this point, raw key_data is not used anymore */
+
+	/* Create byte sequence P.  */
+#define p_bytes key_data /* reuse the buffer as it is of the key_len size */
+	cp = p_bytes; /* was: ... = alloca(key_len); */
+	for (cnt = key_len; cnt >= _32or64; cnt -= _32or64) {
+		cp = memcpy(cp, temp_result, _32or64);
+		cp += _32or64;
+	}
+	memcpy(cp, temp_result, cnt);
+
+	/* S sequence: hash the salt 16 + alt_result[0] times.  */
+	sha_begin(&alt_ctx);
+	for (cnt = 0; cnt < 16 + alt_result[0]; ++cnt)
+		sha_hash(salt_data, salt_len, &alt_ctx);
+	sha_end(temp_result, &alt_ctx);
+
+	/* NB: past this point, raw salt_data is not used anymore */
+
+	/* Create byte sequence S.  */
+#define s_bytes salt_data /* reuse the buffer as it is of the salt_len size */
+	cp = s_bytes; /* was: ... = alloca(salt_len); */
+	for (cnt = salt_len; cnt >= _32or64; cnt -= _32or64) {
+		cp = memcpy(cp, temp_result, _32or64);
+		cp += _32or64;
+	}
+	memcpy(cp, temp_result, cnt);
+
+	/* Repeatedly run the collected hash value through SHA to burn CPU cycles.  */
+	for (cnt = 0; cnt < rounds; ++cnt) {
+		sha_begin(&ctx);
+
+		/* Add key or last result.  */
+		if ((cnt & 1) != 0)
+			sha_hash(p_bytes, key_len, &ctx);
+		else
+			sha_hash(alt_result, _32or64, &ctx);
+		/* Add salt for numbers not divisible by 3.  */
+		if (cnt % 3 != 0)
+			sha_hash(s_bytes, salt_len, &ctx);
+		/* Add key for numbers not divisible by 7.  */
+		if (cnt % 7 != 0)
+			sha_hash(p_bytes, key_len, &ctx);
+		/* Add key or last result.  */
+		if ((cnt & 1) != 0)
+			sha_hash(alt_result, _32or64, &ctx);
+		else
+			sha_hash(p_bytes, key_len, &ctx);
+
+		sha_end(alt_result, &ctx);
+	}
+
+	/* Append encrypted password to result buffer */
+//TODO: replace with something like
+//	bb_uuencode(cp, src, length, bb_uuenc_tbl_XXXbase64);
+#define b64_from_24bit(B2, B1, B0, N) \
+do {							\
+	unsigned w = ((B2) << 16) | ((B1) << 8) | (B0);	\
+	resptr = to64(resptr, w, N);			\
+} while (0)
+	if (is_sha512 == '5') {
+		b64_from_24bit(alt_result[0], alt_result[10], alt_result[20], 4);
+		b64_from_24bit(alt_result[21], alt_result[1], alt_result[11], 4);
+		b64_from_24bit(alt_result[12], alt_result[22], alt_result[2], 4);
+		b64_from_24bit(alt_result[3], alt_result[13], alt_result[23], 4);
+		b64_from_24bit(alt_result[24], alt_result[4], alt_result[14], 4);
+		b64_from_24bit(alt_result[15], alt_result[25], alt_result[5], 4);
+		b64_from_24bit(alt_result[6], alt_result[16], alt_result[26], 4);
+		b64_from_24bit(alt_result[27], alt_result[7], alt_result[17], 4);
+		b64_from_24bit(alt_result[18], alt_result[28], alt_result[8], 4);
+		b64_from_24bit(alt_result[9], alt_result[19], alt_result[29], 4);
+		b64_from_24bit(0, alt_result[31], alt_result[30], 3);
+	} else {
+		b64_from_24bit(alt_result[0], alt_result[21], alt_result[42], 4);
+		b64_from_24bit(alt_result[22], alt_result[43], alt_result[1], 4);
+		b64_from_24bit(alt_result[44], alt_result[2], alt_result[23], 4);
+		b64_from_24bit(alt_result[3], alt_result[24], alt_result[45], 4);
+		b64_from_24bit(alt_result[25], alt_result[46], alt_result[4], 4);
+		b64_from_24bit(alt_result[47], alt_result[5], alt_result[26], 4);
+		b64_from_24bit(alt_result[6], alt_result[27], alt_result[48], 4);
+		b64_from_24bit(alt_result[28], alt_result[49], alt_result[7], 4);
+		b64_from_24bit(alt_result[50], alt_result[8], alt_result[29], 4);
+		b64_from_24bit(alt_result[9], alt_result[30], alt_result[51], 4);
+		b64_from_24bit(alt_result[31], alt_result[52], alt_result[10], 4);
+		b64_from_24bit(alt_result[53], alt_result[11], alt_result[32], 4);
+		b64_from_24bit(alt_result[12], alt_result[33], alt_result[54], 4);
+		b64_from_24bit(alt_result[34], alt_result[55], alt_result[13], 4);
+		b64_from_24bit(alt_result[56], alt_result[14], alt_result[35], 4);
+		b64_from_24bit(alt_result[15], alt_result[36], alt_result[57], 4);
+		b64_from_24bit(alt_result[37], alt_result[58], alt_result[16], 4);
+		b64_from_24bit(alt_result[59], alt_result[17], alt_result[38], 4);
+		b64_from_24bit(alt_result[18], alt_result[39], alt_result[60], 4);
+		b64_from_24bit(alt_result[40], alt_result[61], alt_result[19], 4);
+		b64_from_24bit(alt_result[62], alt_result[20], alt_result[41], 4);
+		b64_from_24bit(0, 0, alt_result[63], 2);
+	}
+	/* *resptr = '\0'; - xzalloc did it */
+#undef b64_from_24bit
+
+	/* Clear the intermediate buffers so that people attaching to processes
+	   or reading core dumps cannot get any information.  */
+	memset(temp_result, 0, sizeof(temp_result));
+	memset(alt_result, 0, sizeof(alt_result));
+	memset(&ctx, 0, sizeof(ctx));
+	memset(&alt_ctx, 0, sizeof(alt_ctx));
+	memset(key_data, 0, key_len); /* also p_bytes */
+	memset(salt_data, 0, salt_len); /* also s_bytes */
+	free(key_data);
+	free(salt_data);
+#undef p_bytes
+#undef s_bytes
+
+	return result;
+}