libbb/sha1: add config-selectable fully unrolled version, closes 14391

function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)         Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Denys Vlasenko 2021-12-30 13:07:12 +01:00
parent 9173c9cce4
commit 25aadc893d
2 changed files with 95 additions and 14 deletions

libbb/Config.src

@@ -42,21 +42,32 @@ config MD5_SMALL
default 1 # all "fast or small" options default to small
range 0 3
help
Trade binary size versus speed for the md5sum algorithm.
Trade binary size versus speed for the md5 algorithm.
Approximate values running uClibc and hashing
linux-2.4.4.tar.bz2 were:
value         user times (sec)   text size (386)
0 (fastest)   1.1                6144
1             1.4                5392
2             3.0                5088
3 (smallest)  5.1                4912
config SHA1_SMALL
int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
default 3 # all "fast or small" options default to small
range 0 3
help
Trade binary size versus speed for the sha1 algorithm.
                throughput MB/s     size of sha1_process_block64
value           486     x86-64      486     x86-64
0               339     374         4149    4167
1,2,3           200     195         358     380
config SHA3_SMALL
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
default 1 # all "fast or small" options default to small
range 0 1
help
Trade binary size versus speed for the sha3sum algorithm.
Trade binary size versus speed for the sha3 algorithm.
SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
64-bit x86: +270 bytes of code, 45% faster
32-bit x86: +450 bytes of code, 75% faster
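For context (standard busybox Kconfig behaviour, not shown in this diff): an int-valued option like SHA1_SMALL becomes a numeric CONFIG_SHA1_SMALL macro in the generated config header, which is what the preprocessor test in the hash code below compares against. A minimal sketch of the selection, with the header path and surrounding details assumed rather than taken from this commit:

/* .config, e.g. after "make menuconfig":
 *   CONFIG_SHA1_SMALL=0
 * the generated config header (assumed to be include/autoconf.h) then contains:
 */
#define CONFIG_SHA1_SMALL 0

/* which picks the implementation at compile time: */
#if CONFIG_SHA1_SMALL == 0
/* fully unrolled sha1_process_block64: ~4 KiB of code, ~1.7-1.9x faster */
#else
/* compact loop version (the default, SHA1_SMALL=3): ~360-380 bytes */
#endif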

libbb/hash_md5_sha.c

@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# undef OP
# endif
/* Add checksum to the starting values */
ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
ctx->hash[3] += D;
#endif
}
#undef OP
#undef FF
#undef FG
#undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
* then rebuild and compare "shaNNNsum bigfile" results.
*/
#if CONFIG_SHA1_SMALL == 0
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
* It seems further speedup can be achieved by handling more than
* 64 bytes per one function call (coreutils does that).
*/
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
static const uint32_t rconsts[] ALIGN4 = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
uint32_t W[16];
uint32_t a, b, c, d, e;
a = ctx->hash[0];
b = ctx->hash[1];
c = ctx->hash[2];
d = ctx->hash[3];
e = ctx->hash[4];
#undef OP
#define OP(A,B,C,D,E, n) \
do { \
uint32_t work = EXPR(B, C, D); \
if (n <= 15) \
work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
if (n >= 16) \
work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
E += work + rotl32(A, 5) + rconsts[n / 20]; \
B = rotl32(B, 30); \
} while (0)
#define OP20(n) \
OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
/* 4 rounds of 20 operations each */
#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
OP20(0);
#undef EXPR
#define EXPR(b,c,d) (c ^ d ^ b)
OP20(20);
#undef EXPR
#define EXPR(b,c,d) (((b | c) & d) | (b & c))
OP20(40);
#undef EXPR
#define EXPR(b,c,d) (c ^ d ^ b)
OP20(60);
#undef EXPR
#undef OP
#undef OP20
ctx->hash[0] += a;
ctx->hash[1] += b;
ctx->hash[2] += c;
ctx->hash[3] += d;
ctx->hash[4] += e;
}
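The OP macro above relies on a 16-word rolling window for the message schedule: because 13, 8 and 2 are congruent to -3, -8 and -14 modulo 16, the expression W[(n+13)&0xf] ^ W[(n+8)&0xf] ^ W[(n+2)&0xf] ^ W[n&0xf] is the textbook recurrence W[t] = rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) computed in place, with no 80-word array. (The compact version below uses the same idea, avoiding the masks by mirroring the window in a doubled W[16+16] buffer.) A standalone sketch, not part of this commit, that checks the rolling window against the full 80-word schedule:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint32_t rotl(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* Compare the straightforward 80-word schedule with the rolling window. */
static void check_schedule(const uint32_t msg[16])
{
	uint32_t full[80], win[16];
	int t;

	memcpy(full, msg, 16 * sizeof(uint32_t));
	memcpy(win, msg, sizeof(win));
	for (t = 16; t < 80; t++)
		full[t] = rotl(full[t-3] ^ full[t-8] ^ full[t-14] ^ full[t-16], 1);
	for (t = 16; t < 80; t++) {
		/* same expression as in the OP macro, with n = t */
		win[t & 0xf] = rotl(win[(t+13) & 0xf] ^ win[(t+8) & 0xf] ^ win[(t+2) & 0xf] ^ win[t & 0xf], 1);
		assert(win[t & 0xf] == full[t]);
	}
}

int main(void)
{
	uint32_t msg[16];
	int i;

	for (i = 0; i < 16; i++)
		msg[i] = 0x9E3779B9u * (uint32_t)(i + 1); /* arbitrary test pattern */
	check_schedule(msg);
	return 0;
}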
#else
/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
/* Compact version, almost twice as slow as fully unrolled */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
static const uint32_t rconsts[] ALIGN4 = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
int i, j;
int cnt;
int n;
uint32_t W[16+16];
uint32_t a, b, c, d, e;
/* On-stack work buffer frees up one register in the main loop
* which otherwise will be needed to hold ctx pointer */
* which otherwise will be needed to hold ctx pointer.
*
* The compiler is not smart enough to realize it, though. :(
* If __attribute__((optimize("2"))) is added to the function,
* only then gcc-9.3.1 spills "ctx" to stack and uses the freed
* register (making code 6 bytes smaller, not just faster).
*/
for (i = 0; i < 16; i++)
W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
e = ctx->hash[4];
/* 4 rounds of 20 operations each */
cnt = 0;
n = 0;
for (i = 0; i < 4; i++) {
j = 19;
do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
else /* i = 1 or 3 */
work ^= b;
ge16:
W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
}
work += W[cnt];
work += W[n];
work += e + rotl32(a, 5) + rconsts[i];
/* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
c = rotl32(b, 30);
b = a;
a = work;
cnt = (cnt + 1) & 15;
n = (n + 1) & 15;
} while (--j >= 0);
}
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
ctx->hash[3] += d;
ctx->hash[4] += e;
}
#endif
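The EXPR macros in the unrolled version above are minimized forms of the FIPS 180-2 round functions: ((c ^ d) & b) ^ d is Ch(b,c,d) with one boolean operation fewer than the canonical (b & c) | (~b & d), and ((b | c) & d) | (b & c) is Maj(b,c,d). A standalone check, not part of this commit, that the identities hold bitwise (eight single-bit cases cover every bit position, since the expressions are purely bitwise):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned i;

	for (i = 0; i < 8; i++) {
		/* expand each of the 8 bit patterns to all-zeroes / all-ones words */
		uint32_t b = -(uint32_t)((i >> 2) & 1);
		uint32_t c = -(uint32_t)((i >> 1) & 1);
		uint32_t d = -(uint32_t)(i & 1);

		/* rounds 0..19: choice function */
		assert((((c ^ d) & b) ^ d) == ((b & c) | (~b & d)));
		/* rounds 40..59: majority function */
		assert((((b | c) & d) | (b & c)) == ((b & c) | (b & d) | (c & d)));
	}
	return 0;
}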
/* Constants for SHA512 from FIPS 180-2:4.2.3.
* SHA256 constants from FIPS 180-2:4.2.2