libbb/sha1: add config-selectable fully unrolled version, closes 14391
function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)        Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
		| @@ -42,21 +42,32 @@ config MD5_SMALL | ||||
| 	default 1  # all "fast or small" options default to small | ||||
| 	range 0 3 | ||||
| 	help | ||||
| 	Trade binary size versus speed for the md5sum algorithm. | ||||
| 	Trade binary size versus speed for the md5 algorithm. | ||||
| 	Approximate values running uClibc and hashing | ||||
| 	linux-2.4.4.tar.bz2 were: | ||||
| 	value               user times (sec)  text size (386) | ||||
| 	0 (fastest)         1.1                6144 | ||||
| 	1                   1.4                5392 | ||||
| 	2                   3.0                5088 | ||||
| 	3 (smallest)        5.1                4912 | ||||
| 	value           user times (sec)  text size (386) | ||||
| 	0 (fastest)     1.1               6144 | ||||
| 	1               1.4               5392 | ||||
| 	2               3.0               5088 | ||||
| 	3 (smallest)    5.1               4912 | ||||
|  | ||||
| config SHA1_SMALL | ||||
| 	int "SHA1: Trade bytes for speed (0:fast, 3:slow)" | ||||
| 	default 3  # all "fast or small" options default to small | ||||
| 	range 0 3 | ||||
| 	help | ||||
| 	Trade binary size versus speed for the sha1 algorithm. | ||||
| 	                throughput MB/s   size of sha1_process_block64 | ||||
| 	value           486  x86-64       486   x86-64 | ||||
| 	0               339  374          4149  4167 | ||||
| 	1,2,3           200  195           358   380 | ||||
|  | ||||
| config SHA3_SMALL | ||||
| 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | ||||
| 	default 1  # all "fast or small" options default to small | ||||
| 	range 0 1 | ||||
| 	help | ||||
| 	Trade binary size versus speed for the sha3sum algorithm. | ||||
| 	Trade binary size versus speed for the sha3 algorithm. | ||||
| 	SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): | ||||
| 	64-bit x86: +270 bytes of code, 45% faster | ||||
| 	32-bit x86: +450 bytes of code, 75% faster | ||||
|   | ||||
| @@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) | ||||
| 	OP(FI, D, A, B, C, 11, 10, 0xbd3af235); | ||||
| 	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); | ||||
| 	OP(FI, B, C, D, A, 9, 21, 0xeb86d391); | ||||
| # undef OP | ||||
| # endif | ||||
| 	/* Add checksum to the starting values */ | ||||
| 	ctx->hash[0] += A; | ||||
| @@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx) | ||||
| 	ctx->hash[3] += D; | ||||
| #endif | ||||
| } | ||||
| #undef OP | ||||
| #undef FF | ||||
| #undef FG | ||||
| #undef FH | ||||
| @@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf) | ||||
|  * then rebuild and compare "shaNNNsum bigfile" results. | ||||
|  */ | ||||
|  | ||||
#if CONFIG_SHA1_SMALL == 0
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
 * It seems further speedup can be achieved by handling more than
 * 64 bytes per one function call (coreutils does that).
 */
/* Process one 64-byte block from ctx->wbuffer, updating ctx->hash[0..4].
 * All 80 SHA1 rounds are expanded at compile time via OP/OP20 below,
 * trading ~3800 bytes of code for speed (selected by CONFIG_SHA1_SMALL=0).
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	/* Per-round additive constants, one per group of 20 rounds
	 * (indexed by n / 20 in OP below) */
	static const uint32_t rconsts[] ALIGN4 = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	/* Message schedule kept as a 16-word circular window (n & 0xf)
	 * instead of the full 80-word array */
	uint32_t W[16];
	uint32_t a, b, c, d, e;

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

/* One SHA1 round. n is a compile-time constant, so exactly one of the
 * two "if (n ...)" branches survives in each expansion:
 *  - rounds 0..15 load (and byte-swap) a word from the input block;
 *  - rounds 16..79 derive the next schedule word from the window.
 * EXPR is redefined before each 20-round group to the group's
 * round function F(b,c,d). Caller rotates the a..e roles by passing
 * them in shifted order (see OP20), so no register shuffling is emitted.
 */
#undef OP
#define OP(A,B,C,D,E, n) \
	do { \
		uint32_t work = EXPR(B, C, D); \
		if (n <= 15) \
			work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
		if (n >= 16) \
			work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
		E += work + rotl32(A, 5) + rconsts[n / 20]; \
		B = rotl32(B, 30); \
	} while (0)
/* Expand 20 consecutive rounds, cycling the five state variables
 * through the A..E parameter positions instead of moving data */
#define OP20(n) \
	OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
	OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
	OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
	OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))

	/* 4 rounds of 20 operations each */
	/* Rounds  0..19: Ch(b,c,d) = (b & c) | (~b & d), in branch-free form */
#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
	OP20(0);
#undef EXPR
	/* Rounds 20..39: Parity(b,c,d) */
#define EXPR(b,c,d) (c ^ d ^ b)
	OP20(20);
#undef EXPR
	/* Rounds 40..59: Maj(b,c,d), in a form with one shared subexpression */
#define EXPR(b,c,d) (((b | c) & d) | (b & c))
	OP20(40);
#undef EXPR
	/* Rounds 60..79: Parity(b,c,d) again */
#define EXPR(b,c,d) (c ^ d ^ b)
	OP20(60);

#undef EXPR
#undef OP
#undef OP20

	/* Add this block's result into the running hash state */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}
| #else | ||||
| /* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */ | ||||
|  | ||||
| /* Compact version, almost twice as slow as fully unrolled */ | ||||
| static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| { | ||||
| 	static const uint32_t rconsts[] ALIGN4 = { | ||||
| 		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 | ||||
| 	}; | ||||
| 	int i, j; | ||||
| 	int cnt; | ||||
| 	int n; | ||||
| 	uint32_t W[16+16]; | ||||
| 	uint32_t a, b, c, d, e; | ||||
|  | ||||
| 	/* On-stack work buffer frees up one register in the main loop | ||||
| 	 * which otherwise will be needed to hold ctx pointer */ | ||||
| 	 * which otherwise will be needed to hold ctx pointer. | ||||
| 	 * | ||||
| 	 * The compiler is not smart enough to realize it, though. :( | ||||
| 	 * If __attribute__((optimize("2"))) is added to the function, | ||||
| 	 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed | ||||
| 	 * register (making code 6 bytes smaller, not just faster). | ||||
| 	 */ | ||||
| 	for (i = 0; i < 16; i++) | ||||
| 		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); | ||||
|  | ||||
| @@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 	e = ctx->hash[4]; | ||||
|  | ||||
| 	/* 4 rounds of 20 operations each */ | ||||
| 	cnt = 0; | ||||
| 	n = 0; | ||||
| 	for (i = 0; i < 4; i++) { | ||||
| 		j = 19; | ||||
| 		do { | ||||
| @@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 				else /* i = 1 or 3 */ | ||||
| 					work ^= b; | ||||
|  ge16: | ||||
| 				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); | ||||
| 				W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1); | ||||
| 			} | ||||
| 			work += W[cnt]; | ||||
| 			work += W[n]; | ||||
| 			work += e + rotl32(a, 5) + rconsts[i]; | ||||
|  | ||||
| 			/* Rotate by one for next time */ | ||||
| @@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 			c = rotl32(b, 30); | ||||
| 			b = a; | ||||
| 			a = work; | ||||
| 			cnt = (cnt + 1) & 15; | ||||
| 			n = (n + 1) & 15; | ||||
| 		} while (--j >= 0); | ||||
| 	} | ||||
|  | ||||
| @@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) | ||||
| 	ctx->hash[3] += d; | ||||
| 	ctx->hash[4] += e; | ||||
| } | ||||
| #endif | ||||
|  | ||||
| /* Constants for SHA512 from FIPS 180-2:4.2.3. | ||||
|  * SHA256 constants from FIPS 180-2:4.2.2 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user