libbb/sha1: add config-selectable fully unrolled version, closes 14391
function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)        Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
		| @@ -42,21 +42,32 @@ config MD5_SMALL | ||||
| 	default 1  # all "fast or small" options default to small | ||||
| 	range 0 3 | ||||
| 	help | ||||
| 	Trade binary size versus speed for the md5sum algorithm. | ||||
| 	Trade binary size versus speed for the md5 algorithm. | ||||
| 	Approximate values running uClibc and hashing | ||||
| 	linux-2.4.4.tar.bz2 were: | ||||
| 	value               user times (sec)  text size (386) | ||||
| 	0 (fastest)         1.1                6144 | ||||
| 	1                   1.4                5392 | ||||
| 	2                   3.0                5088 | ||||
| 	3 (smallest)        5.1                4912 | ||||
| 	value           user times (sec)  text size (386) | ||||
| 	0 (fastest)     1.1               6144 | ||||
| 	1               1.4               5392 | ||||
| 	2               3.0               5088 | ||||
| 	3 (smallest)    5.1               4912 | ||||
|  | ||||
| config SHA1_SMALL | ||||
| 	int "SHA1: Trade bytes for speed (0:fast, 3:slow)" | ||||
| 	default 3  # all "fast or small" options default to small | ||||
| 	range 0 3 | ||||
| 	help | ||||
| 	Trade binary size versus speed for the sha1 algorithm. | ||||
| 	                throughput MB/s   size of sha1_process_block64 | ||||
| 	value           486  x86-64       486   x86-64 | ||||
| 	0               339  374          4149  4167 | ||||
| 	1,2,3           200  195           358   380 | ||||
|  | ||||
| config SHA3_SMALL | ||||
| 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | ||||
| 	default 1  # all "fast or small" options default to small | ||||
| 	range 0 1 | ||||
| 	help | ||||
| 	Trade binary size versus speed for the sha3sum algorithm. | ||||
| 	Trade binary size versus speed for the sha3 algorithm. | ||||
| 	SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): | ||||
| 	64-bit x86: +270 bytes of code, 45% faster | ||||
| 	32-bit x86: +450 bytes of code, 75% faster | ||||
|   | ||||
| @@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) | ||||
| 	OP(FI, D, A, B, C, 11, 10, 0xbd3af235); | ||||
| 	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); | ||||
| 	OP(FI, B, C, D, A, 9, 21, 0xeb86d391); | ||||
| # undef OP | ||||
| # endif | ||||
| 	/* Add checksum to the starting values */ | ||||
| 	ctx->hash[0] += A; | ||||
| @@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) | ||||
| 	ctx->hash[3] += D; | ||||
| #endif | ||||
| } | ||||
| #undef OP | ||||
| #undef FF | ||||
| #undef FG | ||||
| #undef FH | ||||
| @@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf) | ||||
|  * then rebuild and compare "shaNNNsum bigfile" results. | ||||
|  */ | ||||
|  | ||||
#if CONFIG_SHA1_SMALL == 0
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
 * It seems further speedup can be achieved by handling more than
 * 64 bytes per one function call (coreutils does that).
 */
/* Process one 64-byte block from ctx->wbuffer, updating ctx->hash[0..4].
 * All 80 SHA1 rounds are expanded at compile time via OP/OP20 below,
 * trading ~3800 bytes of code for speed (selected by CONFIG_SHA1_SMALL=0).
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	/* Per-round additive constants, one per group of 20 rounds
	 * (indexed by n / 20 in OP below) */
	static const uint32_t rconsts[] ALIGN4 = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	/* Message schedule kept as a 16-word circular window (n & 0xf)
	 * instead of the full 80-word array */
	uint32_t W[16];
	uint32_t a, b, c, d, e;

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

/* One SHA1 round. n is a compile-time constant, so exactly one of the
 * two "if (n ...)" branches survives in each expansion:
 *  - rounds 0..15 load (and byte-swap) a word from the input block;
 *  - rounds 16..79 derive the next schedule word from the window.
 * EXPR is redefined before each 20-round group to the group's
 * round function F(b,c,d). Caller rotates the a..e roles by passing
 * them in shifted order (see OP20), so no register shuffling is emitted.
 */
#undef OP
#define OP(A,B,C,D,E, n) \
	do { \
		uint32_t work = EXPR(B, C, D); \
		if (n <= 15) \
			work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
		if (n >= 16) \
			work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
		E += work + rotl32(A, 5) + rconsts[n / 20]; \
		B = rotl32(B, 30); \
	} while (0)
/* Expand 20 consecutive rounds, cycling the five state variables
 * through the A..E parameter positions instead of moving data */
#define OP20(n) \
	OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
	OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
	OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
	OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))

	/* 4 rounds of 20 operations each */
	/* Rounds  0..19: Ch(b,c,d) = (b & c) | (~b & d), in branch-free form */
#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
	OP20(0);
#undef EXPR
	/* Rounds 20..39: Parity(b,c,d) */
#define EXPR(b,c,d) (c ^ d ^ b)
	OP20(20);
#undef EXPR
	/* Rounds 40..59: Maj(b,c,d), in a form with one shared subexpression */
#define EXPR(b,c,d) (((b | c) & d) | (b & c))
	OP20(40);
#undef EXPR
	/* Rounds 60..79: Parity(b,c,d) again */
#define EXPR(b,c,d) (c ^ d ^ b)
	OP20(60);

#undef EXPR
#undef OP
#undef OP20

	/* Add this block's result into the running hash state */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}
| #else | ||||
| /* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */ | ||||
|  | ||||
| /* Compact version, almost twice as slow as fully unrolled */ | ||||
| static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| { | ||||
| 	static const uint32_t rconsts[] ALIGN4 = { | ||||
| 		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 | ||||
| 	}; | ||||
| 	int i, j; | ||||
| 	int cnt; | ||||
| 	int n; | ||||
| 	uint32_t W[16+16]; | ||||
| 	uint32_t a, b, c, d, e; | ||||
|  | ||||
| 	/* On-stack work buffer frees up one register in the main loop | ||||
| 	 * which otherwise will be needed to hold ctx pointer */ | ||||
| 	 * which otherwise will be needed to hold ctx pointer. | ||||
| 	 * | ||||
| 	 * The compiler is not smart enough to realize it, though. :( | ||||
| 	 * If __attribute__((optimize("2"))) is added to the function, | ||||
| 	 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed | ||||
| 	 * register (making code 6 bytes smaller, not just faster). | ||||
| 	 */ | ||||
| 	for (i = 0; i < 16; i++) | ||||
| 		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); | ||||
|  | ||||
| @@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 	e = ctx->hash[4]; | ||||
|  | ||||
| 	/* 4 rounds of 20 operations each */ | ||||
| 	cnt = 0; | ||||
| 	n = 0; | ||||
| 	for (i = 0; i < 4; i++) { | ||||
| 		j = 19; | ||||
| 		do { | ||||
| @@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 				else /* i = 1 or 3 */ | ||||
| 					work ^= b; | ||||
|  ge16: | ||||
| 				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); | ||||
| 				W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); | ||||
| 			} | ||||
| 			work += W[cnt]; | ||||
| 			work += W[n]; | ||||
| 			work += e + rotl32(a, 5) + rconsts[i]; | ||||
|  | ||||
| 			/* Rotate by one for next time */ | ||||
| @@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 			c = rotl32(b, 30); | ||||
| 			b = a; | ||||
| 			a = work; | ||||
| 			cnt = (cnt + 1) & 15; | ||||
| 			n = (n + 1) & 15; | ||||
| 		} while (--j >= 0); | ||||
| 	} | ||||
|  | ||||
| @@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 	ctx->hash[3] += d; | ||||
| 	ctx->hash[4] += e; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| /* Constants for SHA512 from FIPS 180-2:4.2.3. | ||||
|  * SHA256 constants from FIPS 180-2:4.2.2 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user