libbb/sha1: shrink and speed up fully unrolled version
function                    old     new   delta
sha1_process_block64       4149    3950    -199

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
0b62a08777
commit
f09d088fdf
@ -59,7 +59,7 @@ config SHA1_SMALL
|
||||
Trade binary size versus speed for the sha1 algorithm.
|
||||
throughput MB/s size of sha1_process_block64
|
||||
value 486 x86-64 486 x86-64
|
||||
0 339 374 4149 4167
|
||||
0 360 374 3950 4167
|
||||
1 224 229 654 732
|
||||
2,3 200 195 358 380
|
||||
|
||||
|
@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
||||
d = ctx->hash[3];
|
||||
e = ctx->hash[4];
|
||||
|
||||
/* From kernel source comments:
|
||||
* """
|
||||
* If you have 32 registers or more, the compiler can (and should)
|
||||
* try to change the array[] accesses into registers. However, on
|
||||
* machines with less than ~25 registers, that won't really work,
|
||||
* and at least gcc will make an unholy mess of it.
|
||||
*
|
||||
* So to avoid that mess which just slows things down, we force
|
||||
* the stores to memory to actually happen (we might be better off
|
||||
* with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
|
||||
* suggested by Artur Skawina - that will also make gcc unable to
|
||||
* try to do the silly "optimize away loads" part because it won't
|
||||
* see what the value will be).
|
||||
* """
|
||||
*/
|
||||
#if defined(__i386__)
|
||||
# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
|
||||
#else
|
||||
# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
|
||||
#endif
|
||||
|
||||
#undef OP
|
||||
#define OP(A,B,C,D,E, n) \
|
||||
do { \
|
||||
@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
||||
work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
|
||||
if (n >= 16) \
|
||||
work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
|
||||
DO_NOT_TRY_PROPAGATING(W[n & 15]); \
|
||||
E += work + rotl32(A, 5) + rconsts[n / 20]; \
|
||||
B = rotl32(B, 30); \
|
||||
} while (0)
|
||||
|
Loading…
Reference in New Issue
Block a user