diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 3b1366762..a0eec7789 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -988,24 +988,29 @@ static void KeccakF(uint64_t *state)
 	for (round = 0; round < cKeccakNumberOfRounds; ++round) {
 		/* Theta */
 		{
-			uint64_t BC[5];
+			uint64_t BC[10];
 			for (x = 0; x < 5; ++x) {
-				BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
-					state[15 + x] ^ state[20 + x];
+				BC[x + 5] = BC[x] = state[x]
+					^ state[x + 5] ^ state[x + 10]
+					^ state[x + 15] ^ state[x + 20];
 			}
+			/* Using 2x5 vector above eliminates the need to use
+			 * [Mod5[x+N]] index trick below to calculate (x+N) % 5,
+			 * and the code is a bit _smaller_.
+			 */
 			for (x = 0; x < 5; ++x) {
-				uint64_t temp = BC[KeccakF_Mod5[x + 4]] ^
-					rotl64(BC[KeccakF_Mod5[x + 1]], 1);
+				uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
 				if (SHA3_SMALL && !ARCH_IS_64BIT) {
 					for (y = 0; y <= 20; y += 5)
-						state[y + x] ^= temp;
+						state[x + y] ^= temp;
 				} else {
-					/* on 64-bit arch, this is actually smaller too */
-					state[0 + x] ^= temp;
-					state[5 + x] ^= temp;
-					state[10 + x] ^= temp;
-					state[15 + x] ^= temp;
-					state[20 + x] ^= temp;
+					/* On 64-bit, this is also smaller,
+					 * not only faster, than loop */
+					state[x] ^= temp;
+					state[x + 5] ^= temp;
+					state[x + 10] ^= temp;
+					state[x + 15] ^= temp;
+					state[x + 20] ^= temp;
 				}
 			}
 		}
@@ -1019,7 +1024,7 @@ static void KeccakF(uint64_t *state)
 				t1 = t0;
 			}
 		} else {
-			/* Especially large benefit for 32-bit arch:
+			/* Especially large benefit for 32-bit arch (75% faster):
 			 * 64-bit rotations by non-constant usually are SLOW on those.
 			 * We resort to unrolling here.
 			 * This optimizes out KeccakF_PiLane[] and KeccakF_RotationConstants[],
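
The trick the patch comment describes: mirroring the five column parities into BC[5..9] keeps BC[x + 4] and BC[x + 1] in bounds for x = 0..4, so the mod-5 lookup table (KeccakF_Mod5[]) is no longer needed in the Theta step. Below is a minimal standalone sketch, not busybox code, that compares the two approaches; the helper names theta_mod5 and theta_doubled are made up for illustration, and only the indexing idea is taken from the patch.

/* Standalone sketch: verify that the doubled-BC Theta step matches the
 * mod-5-table version.  Build with any C99 compiler, e.g. cc -O2 theta.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t rotl64(uint64_t v, unsigned n)
{
	return (v << n) | (v >> (64 - n));
}

/* Reference version: (x + N) % 5 via a small lookup table, as in the old code */
static void theta_mod5(uint64_t state[25])
{
	static const uint8_t Mod5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4 };
	uint64_t BC[5];
	unsigned x, y;

	for (x = 0; x < 5; ++x)
		BC[x] = state[x] ^ state[x + 5] ^ state[x + 10]
			^ state[x + 15] ^ state[x + 20];
	for (x = 0; x < 5; ++x) {
		uint64_t temp = BC[Mod5[x + 4]] ^ rotl64(BC[Mod5[x + 1]], 1);
		for (y = 0; y <= 20; y += 5)
			state[x + y] ^= temp;
	}
}

/* Patched idea: store each column parity twice, so BC[x + 4] and BC[x + 1]
 * never need to wrap around */
static void theta_doubled(uint64_t state[25])
{
	uint64_t BC[10];
	unsigned x, y;

	for (x = 0; x < 5; ++x) {
		BC[x + 5] = BC[x] = state[x] ^ state[x + 5] ^ state[x + 10]
			^ state[x + 15] ^ state[x + 20];
	}
	for (x = 0; x < 5; ++x) {
		uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
		for (y = 0; y <= 20; y += 5)
			state[x + y] ^= temp;
	}
}

int main(void)
{
	uint64_t a[25], b[25];
	unsigned i;

	/* Fill both states with the same arbitrary pattern */
	for (i = 0; i < 25; ++i)
		a[i] = b[i] = 0x0123456789abcdefULL * (i + 1);

	theta_mod5(a);
	theta_doubled(b);
	assert(memcmp(a, b, sizeof(a)) == 0);
	printf("Theta variants agree\n");
	return 0;
}

Beyond dropping the table lookups, the patch's own comments note that the doubled array also makes the generated code slightly smaller.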