libbb/sha1: add config-selectable partially unrolled version
function                                             old     new   delta
sha1_process_block64                                 364     732    +368
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 368/-16)           Total: 352 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
25aadc893d
commit
0b62a08777
@ -60,7 +60,8 @@ config SHA1_SMALL
|
|||||||
throughput MB/s size of sha1_process_block64
|
throughput MB/s size of sha1_process_block64
|
||||||
value 486 x86-64 486 x86-64
|
value 486 x86-64 486 x86-64
|
||||||
0 339 374 4149 4167
|
0 339 374 4149 4167
|
||||||
1,2,3 200 195 358 380
|
1 224 229 654 732
|
||||||
|
2,3 200 195 358 380
|
||||||
|
|
||||||
config SHA3_SMALL
|
config SHA3_SMALL
|
||||||
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
|
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
|
||||||
|
@ -514,9 +514,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
do { \
|
do { \
|
||||||
uint32_t work = EXPR(B, C, D); \
|
uint32_t work = EXPR(B, C, D); \
|
||||||
if (n <= 15) \
|
if (n <= 15) \
|
||||||
work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
|
work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
|
||||||
if (n >= 16) \
|
if (n >= 16) \
|
||||||
work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
|
work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
|
||||||
E += work + rotl32(A, 5) + rconsts[n / 20]; \
|
E += work + rotl32(A, 5) + rconsts[n / 20]; \
|
||||||
B = rotl32(B, 30); \
|
B = rotl32(B, 30); \
|
||||||
} while (0)
|
} while (0)
|
||||||
@ -549,9 +549,101 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
ctx->hash[3] += d;
|
ctx->hash[3] += d;
|
||||||
ctx->hash[4] += e;
|
ctx->hash[4] += e;
|
||||||
}
|
}
|
||||||
#else
|
#elif CONFIG_SHA1_SMALL == 1
|
||||||
/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
|
/* Middle-sized version, +300 bytes of code on x86. */
|
||||||
|
/*
 * Process one 64-byte block held in ctx->wbuffer and add the result
 * into the five 32-bit words of ctx->hash.
 *
 * The 80 steps are split into four groups of 20; each group uses its own
 * mixing expression and its own additive constant. The code is written
 * as loops (not fully unrolled) to keep it mid-sized.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	/* Additive constants, one per group of 20 steps */
	static const uint32_t K[] ALIGN4 = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	/*
	 * 16-word window of the message schedule, stored twice: every store
	 * writes both sched[i] and sched[i+16], so sched[i+13], sched[i+8]
	 * and sched[i+2] can be read for any i in 0..15 without masking
	 * the index.
	 */
	uint32_t sched[16 + 16];
	uint32_t va, vb, vc, vd, ve;
	uint32_t t;
	int i;
	int left;

	va = ctx->hash[0];
	vb = ctx->hash[1];
	vc = ctx->hash[2];
	vd = ctx->hash[3];
	ve = ctx->hash[4];

	/* Group 1, steps 0..15: schedule words come straight from the input block */
	for (i = 0; i < 16; i++) {
		t = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
		sched[i + 16] = t;
		sched[i] = t;
		t += (((vc ^ vd) & vb) ^ vd) + ve + rotl32(va, 5) + K[0];
		/* Shift the working state by one position for the next step */
		ve = vd;
		vd = vc;
		vc = rotl32(vb, 30);
		vb = va;
		va = t;
	}

	/* Group 1, steps 16..19: same mixing expression, schedule is now derived */
	for (i = 0; i < 4; i++) {
		t = rotl32(sched[i + 13] ^ sched[i + 8] ^ sched[i + 2] ^ sched[i], 1);
		sched[i + 16] = t;
		sched[i] = t;
		t += (((vc ^ vd) & vb) ^ vd) + ve + rotl32(va, 5) + K[0];
		ve = vd;
		vd = vc;
		vc = rotl32(vb, 30);
		vb = va;
		va = t;
	}
	/* Here i == 4; from now on it keeps cycling through 0..15 */

	/* Group 2: 20 steps */
	for (left = 20; left != 0; left--) {
		t = rotl32(sched[i + 13] ^ sched[i + 8] ^ sched[i + 2] ^ sched[i], 1);
		sched[i + 16] = t;
		sched[i] = t;
		t += (vc ^ vd ^ vb) + ve + rotl32(va, 5) + K[1];
		ve = vd;
		vd = vc;
		vc = rotl32(vb, 30);
		vb = va;
		va = t;
		i = (i + 1) & 15;
	}

	/* Group 3: 20 steps */
	for (left = 20; left != 0; left--) {
		t = rotl32(sched[i + 13] ^ sched[i + 8] ^ sched[i + 2] ^ sched[i], 1);
		sched[i + 16] = t;
		sched[i] = t;
		t += (((vb | vc) & vd) | (vb & vc)) + ve + rotl32(va, 5) + K[2];
		ve = vd;
		vd = vc;
		vc = rotl32(vb, 30);
		vb = va;
		va = t;
		i = (i + 1) & 15;
	}

	/* Group 4: 20 steps, same mixing expression as group 2 */
	for (left = 20; left != 0; left--) {
		t = rotl32(sched[i + 13] ^ sched[i + 8] ^ sched[i + 2] ^ sched[i], 1);
		sched[i + 16] = t;
		sched[i] = t;
		t += (vc ^ vd ^ vb) + ve + rotl32(va, 5) + K[3];
		ve = vd;
		vd = vc;
		vc = rotl32(vb, 30);
		vb = va;
		va = t;
		i = (i + 1) & 15;
	}

	/* Fold the working state back into the running hash */
	ctx->hash[0] += va;
	ctx->hash[1] += vb;
	ctx->hash[2] += vc;
	ctx->hash[3] += vd;
	ctx->hash[4] += ve;
}
|
||||||
|
#else
|
||||||
/* Compact version, almost twice as slow as fully unrolled */
|
/* Compact version, almost twice as slow as fully unrolled */
|
||||||
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user