diff --git a/libbb/Config.src b/libbb/Config.src
index c80bee286..708d3b0c8 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -63,6 +63,13 @@ config SHA1_SMALL
 	  1		224	229		654	732
 	  2,3		200	195		358	380
 
+config SHA1_HWACCEL
+	bool "SHA1: Use hardware accelerated instructions if possible"
+	default y
+	help
+	On x86, this adds ~590 bytes of code. Throughput
+	is about twice that of the fully-unrolled generic code.
+
 config SHA3_SMALL
 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
 	default 1  # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 19b8aad60..a3db02b6f 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -57,6 +57,7 @@ lib-y += make_directory.o
 lib-y += makedev.o
 lib-y += hash_md5_sha.o
 lib-y += hash_md5_sha_x86-64.o
+lib-y += hash_md5_sha_x86-64_shaNI.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index ee19c1cb7..4c6904b48 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 /* in hash_md5_sha_x86-64.S */
 struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
-void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
+void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx);
 
 # else
 
 /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
@@ -1142,6 +1142,28 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 }
 #endif /* NEED_SHA512 */
 
+#if ENABLE_SHA1_HWACCEL
+# if defined(__GNUC__) && defined(__x86_64__)
+static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+	asm (
+		"cpuid\n"
+		: "=a" (*eax), /* Output */
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), /* Input */
+		  "1" (*ebx),
+		  "2" (*ecx),
+		  "3" (*edx)
+		/* No clobbered registers */
+	);
+}
+struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
+void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
+# endif
+#endif
+
 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 {
 	ctx->hash[0] = 0x67452301;
@@ -1151,6 +1173,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 	ctx->hash[4] = 0xc3d2e1f0;
 	ctx->total64 = 0;
 	ctx->process_block = sha1_process_block64;
+#if ENABLE_SHA1_HWACCEL
+# if defined(__GNUC__) && defined(__x86_64__)
+	{
+		static smallint shaNI;
+		if (!shaNI) {
+			/* "ebx = ebx" etc: self-init to silence "may be used uninitialized" */
+			unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
+			cpuid(&eax, &ebx, &ecx, &edx);
+			/* EBX bit 29 is the SHA insns flag: map it to +1 (yes) / -1 (no) */
+			shaNI = ((ebx >> 28) & 2) - 1;
+		}
+		if (shaNI > 0)
+			ctx->process_block = sha1_process_block64_shaNI;
+	}
+# endif
+#endif
 }
 
 static const uint32_t init256[] ALIGN4 = {
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
new file mode 100644
index 000000000..473b472f1
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -0,0 +1,225 @@
+#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
+/* The code is adapted from the Linux kernel's source */
+
+// We use shorter insns, even though they are for the "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such a penalty do not support SHA1 insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find a measurable difference).
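+//
+// A quick map of the SHA-NI instructions used below (semantics per
+// the Intel SDM):
+//	sha1msg1 + xor128 + sha1msg2 compute the message schedule four
+//	W[] words at a time: W[t] = rol(W[t-3]^W[t-8]^W[t-14]^W[t-16], 1)
+//	sha1nexte adds rol(A,30) of an earlier state to the next four
+//	message words, forming the E input for the following rounds
+//	sha1rnds4 $n performs four rounds on ABCD; the immediate selects
+//	the round function and K constant of each 20-round group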
+
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define xor128 pxor
+#define xor128 xorps
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+
+#define extr128_32 pextrd
+//#define extr128_32 extractps	# not shorter
+
+	.section	.text.sha1_process_block64_shaNI,"ax",@progbits
+	.globl	sha1_process_block64_shaNI
+	.hidden	sha1_process_block64_shaNI
+	.type	sha1_process_block64_shaNI, @function
+
+#define ABCD	%xmm0
+#define E0	%xmm1	/* Need two E's b/c they ping pong */
+#define E1	%xmm2
+#define MSG0	%xmm3
+#define MSG1	%xmm4
+#define MSG2	%xmm5
+#define MSG3	%xmm6
+#define SHUF_MASK	%xmm7
+
+	.balign	8	# allow decoders to fetch at least 2 first insns
+sha1_process_block64_shaNI:
+	/* load initial hash values */
+
+	xor128	E0, E0
+	movu128	80(%rdi), ABCD
+	pinsrd	$3, 80+4*4(%rdi), E0	# load to upper 32-bit word
+	shuf128_32 $0x1B, ABCD, ABCD	# 00011011: bswap
+
+	mova128	PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+	/* Save hash values for addition after rounds */
+	mova128	E0, %xmm9
+	mova128	ABCD, %xmm8
+
+	/* Rounds 0-3 */
+	movu128	0*16(%rdi), MSG0
+	pshufb	SHUF_MASK, MSG0
+	paddd	MSG0, E0
+	mova128	ABCD, E1
+	sha1rnds4 $0, E0, ABCD
+
+	/* Rounds 4-7 */
+	movu128	1*16(%rdi), MSG1
+	pshufb	SHUF_MASK, MSG1
+	sha1nexte MSG1, E1
+	mova128	ABCD, E0
+	sha1rnds4 $0, E1, ABCD
+	sha1msg1 MSG1, MSG0
+
+	/* Rounds 8-11 */
+	movu128	2*16(%rdi), MSG2
+	pshufb	SHUF_MASK, MSG2
+	sha1nexte MSG2, E0
+	mova128	ABCD, E1
+	sha1rnds4 $0, E0, ABCD
+	sha1msg1 MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 12-15 */
+	movu128	3*16(%rdi), MSG3
+	pshufb	SHUF_MASK, MSG3
+	sha1nexte MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG3, MSG0
+	sha1rnds4 $0, E1, ABCD
+	sha1msg1 MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 16-19 */
+	sha1nexte MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG0, MSG1
+	sha1rnds4 $0, E0, ABCD
+	sha1msg1 MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 20-23 */
+	sha1nexte MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG1, MSG2
+	sha1rnds4 $1, E1, ABCD
+	sha1msg1 MSG1, MSG0
+	xor128	MSG1, MSG3
+
+	/* Rounds 24-27 */
+	sha1nexte MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG2, MSG3
+	sha1rnds4 $1, E0, ABCD
+	sha1msg1 MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 28-31 */
+	sha1nexte MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG3, MSG0
+	sha1rnds4 $1, E1, ABCD
+	sha1msg1 MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 32-35 */
+	sha1nexte MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG0, MSG1
+	sha1rnds4 $1, E0, ABCD
+	sha1msg1 MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 36-39 */
+	sha1nexte MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG1, MSG2
+	sha1rnds4 $1, E1, ABCD
+	sha1msg1 MSG1, MSG0
+	xor128	MSG1, MSG3
+
+	/* Rounds 40-43 */
+	sha1nexte MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG2, MSG3
+	sha1rnds4 $2, E0, ABCD
+	sha1msg1 MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 44-47 */
+	sha1nexte MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG3, MSG0
+	sha1rnds4 $2, E1, ABCD
+	sha1msg1 MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 48-51 */
+	sha1nexte MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG0, MSG1
+	sha1rnds4 $2, E0, ABCD
+	sha1msg1 MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 52-55 */
+	sha1nexte MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG1, MSG2
+	sha1rnds4 $2, E1, ABCD
+	sha1msg1 MSG1, MSG0
+	xor128	MSG1, MSG3
+
+	/* Rounds 56-59 */
+	sha1nexte MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG2, MSG3
+	sha1rnds4 $2, E0, ABCD
+	sha1msg1 MSG2, MSG1
+	xor128	MSG2, MSG0
+
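+	/* Rounds 60-79 use sha1rnds4 immediate $3 (K = 0xca62c1d6). The
+	 * message schedule winds down: the last W values are finished in
+	 * the rounds 72-75 group, so sha1msg1, then xor128 and sha1msg2,
+	 * drop out of the groups below. */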
+	/* Rounds 60-63 */
+	sha1nexte MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG3, MSG0
+	sha1rnds4 $3, E1, ABCD
+	sha1msg1 MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 64-67 */
+	sha1nexte MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG0, MSG1
+	sha1rnds4 $3, E0, ABCD
+	sha1msg1 MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 68-71 */
+	sha1nexte MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2 MSG1, MSG2
+	sha1rnds4 $3, E1, ABCD
+	xor128	MSG1, MSG3
+
+	/* Rounds 72-75 */
+	sha1nexte MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2 MSG2, MSG3
+	sha1rnds4 $3, E0, ABCD
+
+	/* Rounds 76-79 */
+	sha1nexte MSG3, E1
+	mova128	ABCD, E0
+	sha1rnds4 $3, E1, ABCD
+
+	/* Add current hash values to the previously saved ones */
+	sha1nexte %xmm9, E0
+	paddd	%xmm8, ABCD
+
+	/* Write hash values back in the correct order */
+	shuf128_32 $0x1B, ABCD, ABCD
+	movu128	ABCD, 80(%rdi)
+	extr128_32 $3, E0, 80+4*4(%rdi)
+
+	ret
+	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
+
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align	16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa	0x000102030405060708090a0b0c0d0e0f
+
+#endif
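
Note on the runtime dispatch: sha1_begin() picks the SHA-NI code path via
CPUID. The same check can be exercised standalone; below is a minimal
sketch (an illustration, not part of the patch) assuming gcc on x86-64,
where EBX bit 29 of CPUID leaf 7, subleaf 0, is the SHA extensions flag:

#include <stdio.h>

/* Same wrapper shape as the cpuid() helper added to hash_md5_sha.c */
static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
{
	asm ("cpuid"
		: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		: "0" (*eax), "1" (*ebx), "2" (*ecx), "3" (*edx)
	);
}

int main(void)
{
	unsigned eax = 7, ebx = 0, ecx = 0, edx = 0; /* leaf 7, subleaf 0 */
	cpuid(&eax, &ebx, &ecx, &edx);
	/* Same test sha1_begin() makes: ((ebx >> 28) & 2) isolates bit 29 */
	printf("SHA-NI: %s\n", ((ebx >> 28) & 2) ? "supported" : "not supported");
	return 0;
}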