A really nice patch from Manuel Novoa III that adds a compile-time
configurable size/speed trade-off to md5sum (MD5SUM_SIZE_VS_SPEED).
diff --git a/md5sum.c b/md5sum.c
index dcb05c1..643f827 100644
--- a/md5sum.c
+++ b/md5sum.c
@@ -20,6 +20,24 @@
 /* Written by Ulrich Drepper <drepper@gnu.ai.mit.edu> */
 /* Hacked to work with BusyBox by Alfred M. Szmidt <ams@trillian.itslinux.org> */
 
+/*
+ * June 29, 2001        Manuel Novoa III
+ *
+ * Added MD5SUM_SIZE_VS_SPEED configuration option.
+ *
+ * Currently valid values, with measurements from my system for comparison:
+ *   (using uClibc; timed on the input file linux-2.4.4.tar.bz2)
+ *                     user time (sec)   text size (386)
+ *     0 (fastest)         1.1                6144
+ *     1                   1.4                5392
+ *     2                   3.0                5088
+ *     3 (smallest)        5.1                4912
+ */
+
+#define MD5SUM_SIZE_VS_SPEED 2
+
+/**********************************************************************/
+
 #include <stdio.h>
 #include <errno.h>
 #include <ctype.h>
@@ -184,9 +202,11 @@
 
 
 
+#if MD5SUM_SIZE_VS_SPEED == 0
 /* This array contains the bytes used to pad the buffer to the next
    64-byte boundary.  (RFC 1321, 3.1: Step 1)  */
 static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ...  */  };
+#endif
 
 /* Initialize structure containing state of computation.
    (RFC 1321, 3.3: Step 3)  */
@@ -233,7 +253,12 @@
     ++ctx->total[1];
 
   pad = bytes >= 56 ? 64 + 56 - bytes : 56 - bytes;
+#if MD5SUM_SIZE_VS_SPEED > 0
+  memset(&ctx->buffer[bytes], 0, pad);
+  ctx->buffer[bytes] = 0x80;
+#else
   memcpy(&ctx->buffer[bytes], fillbuf, pad);
+#endif
 
   /* Put the 64-bit file length in *bits* at the end of the buffer.  */
   *(md5_uint32 *) & ctx->buffer[bytes + pad] = SWAP(ctx->total[0] << 3);
@@ -369,6 +394,49 @@
   const md5_uint32 *words = buffer;
   size_t nwords = len / sizeof(md5_uint32);
   const md5_uint32 *endp = words + nwords;
+#if MD5SUM_SIZE_VS_SPEED > 0
+  static const md5_uint32 C_array[] = {
+      /* round 1 */
+      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
+      0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+      0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+      /* round 2 */
+      0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
+      0xd62f105d, 0x2441453,  0xd8a1e681, 0xe7d3fbc8,
+      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
+      0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+      /* round 3 */
+      0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+      0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
+      0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+      /* round 4 */
+      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
+      0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+      0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+  };
+
+  static const char P_array[] = {
+#if MD5SUM_SIZE_VS_SPEED > 1
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
+#endif
+      1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
+      5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
+      0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9  /* 4 */
+  };
+
+#if MD5SUM_SIZE_VS_SPEED > 1
+  static const char S_array[] = {
+      7, 12, 17, 22,
+      5, 9, 14, 20,
+      4, 11, 16, 23,
+      6, 10, 15, 21
+  };
+#endif
+#endif
+
   md5_uint32 A = ctx->A;
   md5_uint32 B = ctx->B;
   md5_uint32 C = ctx->C;
@@ -390,6 +458,79 @@
     md5_uint32 C_save = C;
     md5_uint32 D_save = D;
 
+#if MD5SUM_SIZE_VS_SPEED > 1
+/* Rotate w left by s bits (0 < s < 32); yields the value, callers assign it. */
+#define CYCLIC(w, s) (((w) << (s)) | ((w) >> (32 - (s))))
+    const md5_uint32 *pc;
+    const char *pp;
+    const char *ps;
+    int i;
+    md5_uint32 temp;
+
+    for ( i=0 ; i < 16 ; i++ ) {
+	cwp[i] = SWAP(words[i]);
+    }
+    words += 16;
+
+#if MD5SUM_SIZE_VS_SPEED > 2
+    /* ps advances after each 16-step round, so it never points before S_array. */
+    pc = C_array; pp = P_array; ps = S_array;
+    for ( i = 0 ; i < 64 ; i++ ) {
+	temp = A;
+	switch (i>>4) {
+	    case 0:
+		temp += FF(B,C,D);
+		break;
+	    case 1:
+		temp += FG(B,C,D);
+		break;
+	    case 2:
+		temp += FH(B,C,D);
+		break;
+	    case 3:
+		temp += FI(B,C,D);
+		break;
+	}
+	temp += cwp[(int)(*pp++)] + *pc++;
+	temp = CYCLIC (temp, ps[i&3]);
+	temp += B;
+	A = D; D = C; C = B; B = temp;
+	if ((i&0x0f) == 0x0f) ps += 4;
+    }
+#else
+    pc = C_array; pp = P_array; ps = S_array;
+
+    for ( i = 0 ; i < 16 ; i++ ) {
+	temp = A + FF(B,C,D) + cwp[(int)(*pp++)] + *pc++;
+	temp = CYCLIC (temp, ps[i&3]);
+	temp += B;
+	A = D; D = C; C = B; B = temp;
+    }
+
+    ps += 4;
+    for ( i = 0 ; i < 16 ; i++ ) {
+	temp = A + FG(B,C,D) + cwp[(int)(*pp++)] + *pc++;
+	temp = CYCLIC (temp, ps[i&3]);
+	temp += B;
+	A = D; D = C; C = B; B = temp;
+    }
+    ps += 4;
+    for ( i = 0 ; i < 16 ; i++ ) {
+	temp = A + FH(B,C,D) + cwp[(int)(*pp++)] + *pc++;
+	temp = CYCLIC (temp, ps[i&3]);
+	temp += B;
+	A = D; D = C; C = B; B = temp;
+    }
+    ps += 4;
+    for ( i = 0 ; i < 16 ; i++ ) {
+	temp = A + FI(B,C,D) + cwp[(int)(*pp++)] + *pc++;
+	temp = CYCLIC (temp, ps[i&3]);
+	temp += B;
+	A = D; D = C; C = B; B = temp;
+    }
+
+#endif
+#else
     /* First round: using the given function, the context and a constant
        the next context is computed.  Because the algorithms processing
        unit is a 32-bit word and it is determined to work on words in
@@ -417,7 +558,22 @@
        T[i] = (int) (4294967296.0 * fabs (sin (i))), i=1..64
     */
 
+#if MD5SUM_SIZE_VS_SPEED == 1
+    const md5_uint32 *pc;
+    const char *pp;
+    int i;
+#endif
+
     /* Round 1.  */
+#if MD5SUM_SIZE_VS_SPEED == 1
+    pc = C_array;
+    for ( i=0 ; i < 4 ; i++ ) {
+	OP(A, B, C, D, 7, *pc++);
+	OP(D, A, B, C, 12, *pc++);
+	OP(C, D, A, B, 17, *pc++);
+	OP(B, C, D, A, 22, *pc++);
+    }
+#else
     OP(A, B, C, D, 7, 0xd76aa478);
     OP(D, A, B, C, 12, 0xe8c7b756);
     OP(C, D, A, B, 17, 0x242070db);
@@ -434,6 +590,7 @@
     OP(D, A, B, C, 12, 0xfd987193);
     OP(C, D, A, B, 17, 0xa679438e);
     OP(B, C, D, A, 22, 0x49b40821);
+#endif
 
     /* For the second to fourth round we have the possibly swapped words
        in CORRECT_WORDS.  Redefine the macro to take an additional first
@@ -449,6 +606,15 @@
       while (0)
 
     /* Round 2.  */
+#if MD5SUM_SIZE_VS_SPEED == 1
+    pp = P_array;
+    for ( i=0 ; i < 4 ; i++ ) {
+	OP(FG, A, B, C, D, (int)(*pp++), 5, *pc++);
+	OP(FG, D, A, B, C, (int)(*pp++), 9, *pc++);
+	OP(FG, C, D, A, B, (int)(*pp++), 14, *pc++);
+	OP(FG, B, C, D, A, (int)(*pp++), 20, *pc++);
+    }
+#else
     OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
     OP(FG, D, A, B, C, 6, 9, 0xc040b340);
     OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
@@ -465,8 +631,17 @@
     OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
     OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
     OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
+#endif
 
     /* Round 3.  */
+#if MD5SUM_SIZE_VS_SPEED == 1
+    for ( i=0 ; i < 4 ; i++ ) {
+	OP(FH, A, B, C, D, (int)(*pp++), 4, *pc++);
+	OP(FH, D, A, B, C, (int)(*pp++), 11, *pc++);
+	OP(FH, C, D, A, B, (int)(*pp++), 16, *pc++);
+	OP(FH, B, C, D, A, (int)(*pp++), 23, *pc++);
+    }
+#else
     OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
     OP(FH, D, A, B, C, 8, 11, 0x8771f681);
     OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
@@ -483,8 +658,17 @@
     OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
     OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
     OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
+#endif
 
     /* Round 4.  */
+#if MD5SUM_SIZE_VS_SPEED == 1
+    for ( i=0 ; i < 4 ; i++ ) {
+	OP(FI, A, B, C, D, (int)(*pp++), 6, *pc++);
+	OP(FI, D, A, B, C, (int)(*pp++), 10, *pc++);
+	OP(FI, C, D, A, B, (int)(*pp++), 15, *pc++);
+	OP(FI, B, C, D, A, (int)(*pp++), 21, *pc++);
+    }
+#else
     OP(FI, A, B, C, D, 0, 6, 0xf4292244);
     OP(FI, D, A, B, C, 7, 10, 0x432aff97);
     OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
@@ -501,6 +685,8 @@
     OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
     OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
     OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
+#endif
+#endif
 
     /* Add the starting values of the context.  */
     A += A_save;