libbb/sha1: assembly versions for x86

32 bits:
function                                             old     new   delta
sha1_process_block64                                3950    3657    -293
64 bits:
sha1_process_block64                                4167    3683    -484

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2022-01-01 12:21:01 +01:00
parent f09d088fdf
commit 5f68170204
2 changed files with 418 additions and 1 deletions

View File

@ -59,7 +59,7 @@ config SHA1_SMALL
Trade binary size versus speed for the sha1 algorithm.
throughput MB/s size of sha1_process_block64
value 486 x86-64 486 x86-64
0 360 374 3950 4167
0 367 367 3657 3683
1 224 229 654 732
2,3 200 195 358 380

View File

@ -8,6 +8,9 @@
*/
#include "libbb.h"
#define STR1(s) #s
#define STR(s) STR1(s)
#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
@ -491,6 +494,419 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
*/
#if CONFIG_SHA1_SMALL == 0
# if defined(__GNUC__) && defined(__i386__)
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
{
BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76);
asm(
"\n\
pushl %ebp # \n\
pushl %edi # \n\
pushl %esi # \n\
pushl %ebx # \n\
pushl %eax \n\
movl $15, %edi \n\
1: \n\
movl (%eax,%edi,4), %esi \n\
bswap %esi \n\
pushl %esi \n\
decl %edi \n\
jns 1b \n\
movl 80(%eax), %ebx # b = ctx->hash[1] \n\
movl 84(%eax), %ecx # c = ctx->hash[2] \n\
movl 88(%eax), %edx # d = ctx->hash[3] \n\
movl 92(%eax), %ebp # e = ctx->hash[4] \n\
movl 76(%eax), %eax # a = ctx->hash[0] \n\
#Register and stack use: \n\
# eax..edx: a..d \n\
# ebp: e \n\
# esi,edi: temps \n\
# 4*n(%esp): W[n] \n\
"
#define RD1As(a,b,c,d,e, n, RCONST) \
"\n\
##movl 4*"n"(%esp), %esi # n=0, W[0] already in %esi \n\
movl "c", %edi # c \n\
xorl "d", %edi # ^d \n\
andl "b", %edi # &b \n\
xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD1Bs(a,b,c,d,e, n, RCONST) \
"\n\
movl 4*"n"(%esp), %esi # W[n] \n\
movl "c", %edi # c \n\
xorl "d", %edi # ^d \n\
andl "b", %edi # &b \n\
xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
leal "RCONST"("e",%esi), "e" # e += RCONST + W[n] \n\
addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
movl "c", %edi # c \n\
xorl "d", %edi # ^d \n\
andl "b", %edi # &b \n\
xorl "d", %edi # (((c ^ d) & b) ^ d) \n\
leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
addl %edi, "e" # e += (((c ^ d) & b) ^ d) \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST))
#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x5A827999
RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
movl "c", %edi # c \n\
xorl "d", %edi # ^d \n\
xorl "b", %edi # ^b \n\
leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
addl %edi, "e" # e += (c ^ d ^ b) \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x6ED9EBA1
RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl "b", %edi # di: b \n\
movl "b", %esi # si: b \n\
orl "c", %edi # di: b | c \n\
andl "c", %esi # si: b & c \n\
andl "d", %edi # di: (b | c) & d \n\
orl %esi, %edi # ((b | c) & d) | (b & c) \n\
movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
addl %edi, "e" # += ((b | c) & d) | (b & c)\n\
leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x8F1BBCDC
RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, 4*"n"(%esp) # store to W[n & 15] \n\
movl "c", %edi # c \n\
xorl "d", %edi # ^d \n\
xorl "b", %edi # ^b \n\
leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
addl %edi, "e" # e += (c ^ d ^ b) \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl 4*"n13"(%esp), %esi # W[(n+13) & 15] \n\
xorl 4*"n8"(%esp), %esi # ^W[(n+8) & 15] \n\
xorl 4*"n2"(%esp), %esi # ^W[(n+2) & 15] \n\
xorl 4*"n"(%esp), %esi # ^W[n & 15] \n\
roll %esi # \n\
##movl %esi, 4*"n"(%esp) # store to W[n & 15] elided \n\
movl "c", %edi # c \n\
xorl "d", %edi # ^d \n\
xorl "b", %edi # ^b \n\
leal "RCONST"("e",%esi), "e" # e += RCONST + mixed_W \n\
addl %edi, "e" # e += (c ^ d ^ b) \n\
movl "a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, "e" # e += rotl32(a,5) \n\
rorl $2, "b" # b = rotl32(b,30) \n\
"
#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0xCA62C1D6
RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
"\n\
movl 4*16(%esp), %esi # \n\
addl $4*(16+1), %esp # \n\
addl %eax, 76(%esi) # ctx->hash[0] += a \n\
addl %ebx, 80(%esi) # ctx->hash[1] += b \n\
addl %ecx, 84(%esi) # ctx->hash[2] += c \n\
addl %edx, 88(%esi) # ctx->hash[3] += d \n\
addl %ebp, 92(%esi) # ctx->hash[4] += e \n\
popl %ebx # \n\
popl %esi # \n\
popl %edi # \n\
popl %ebp # \n\
"
); /* asm */
#undef RCONST
}
# elif defined(__GNUC__) && defined(__x86_64__)
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
{
BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80);
asm(
// TODO: store W[] in r8..r15? (r8..r11 are callee-clobbered, no need to save)
"\n\
##pushq %r15 # \n\
##pushq %r14 # \n\
##pushq %r13 # \n\
##pushq %r12 # \n\
##pushq %rbp # \n\
##pushq %rbx # \n\
movq %rbp, %r8 # callee-saved \n\
movq %rbx, %r9 # callee-saved \n\
movq %rdi, %r10 # we need ctx at the end \n\
movl $15, %eax \n\
1: \n\
movl (%rdi,%rax,4), %esi \n\
bswap %esi \n\
movl %esi, -64(%rsp,%rax,4) \n\
decl %eax \n\
jns 1b \n\
movl 80(%rdi), %eax # a = ctx->hash[0] \n\
movl 84(%rdi), %ebx # b = ctx->hash[1] \n\
movl 88(%rdi), %ecx # c = ctx->hash[2] \n\
movl 92(%rdi), %edx # d = ctx->hash[3] \n\
movl 96(%rdi), %ebp # e = ctx->hash[4] \n\
#Register and stack use: \n\
# eax..edx: a..d \n\
# ebp: e \n\
# esi,edi: temps \n\
# -64+4*n(%rsp): W[n] \n\
"
#define RD1As(a,b,c,d,e, n, RCONST) \
"\n\
##movl -64+4*"n"(%rsp), %esi # n=0, W[0] already in %esi \n\
movl %e"c", %edi # c \n\
xorl %e"d", %edi # ^d \n\
andl %e"b", %edi # &b \n\
xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\
addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD1Bs(a,b,c,d,e, n, RCONST) \
"\n\
movl -64+4*"n"(%rsp), %esi # W[n] \n\
movl %e"c", %edi # c \n\
xorl %e"d", %edi # ^d \n\
andl %e"b", %edi # &b \n\
xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n] \n\
addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
movl %e"c", %edi # c \n\
xorl %e"d", %edi # ^d \n\
andl %e"b", %edi # &b \n\
xorl %e"d", %edi # (((c ^ d) & b) ^ d) \n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
addl %edi, %e"e" # e += (((c ^ d) & b) ^ d) \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x5A827999
RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
movl %e"c", %edi # c \n\
xorl %e"d", %edi # ^d \n\
xorl %e"b", %edi # ^b \n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
addl %edi, %e"e" # e += (c ^ d ^ b) \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
#undef RCONST
#define RCONST 0x6ED9EBA1
RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl %e"b", %edi # di: b \n\
movl %e"b", %esi # si: b \n\
orl %e"c", %edi # di: b | c \n\
andl %e"c", %esi # si: b & c \n\
andl %e"d", %edi # di: (b | c) & d \n\
orl %esi, %edi # ((b | c) & d) | (b & c) \n\
movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
addl %edi, %e"e" # += ((b | c) & d) | (b & c)\n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
#undef RCONST
//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement"
#define RCONST -0x70e44324
RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
roll %esi # \n\
movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] \n\
movl %e"c", %edi # c \n\
xorl %e"d", %edi # ^d \n\
xorl %e"b", %edi # ^b \n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
addl %edi, %e"e" # e += (c ^ d ^ b) \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
movl -64+4*"n13"(%rsp), %esi # W[(n+13) & 15] \n\
xorl -64+4*"n8"(%rsp), %esi # ^W[(n+8) & 15] \n\
xorl -64+4*"n2"(%rsp), %esi # ^W[(n+2) & 15] \n\
xorl -64+4*"n"(%rsp), %esi # ^W[n & 15] \n\
roll %esi # \n\
##movl %esi, -64+4*"n"(%rsp) # store to W[n & 15] elided \n\
movl %e"c", %edi # c \n\
xorl %e"d", %edi # ^d \n\
xorl %e"b", %edi # ^b \n\
leal "RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
addl %edi, %e"e" # e += (c ^ d ^ b) \n\
movl %e"a", %esi # \n\
roll $5, %esi # rotl32(a,5) \n\
addl %esi, %e"e" # e += rotl32(a,5) \n\
rorl $2, %e"b" # b = rotl32(b,30) \n\
"
#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#undef RCONST
//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement"
#define RCONST -0x359d3e2a
RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
"\n\
movq %r10, %rdi # \n\
addl %eax, 80(%rdi) # ctx->hash[0] += a \n\
addl %ebx, 84(%rdi) # ctx->hash[1] += b \n\
addl %ecx, 88(%rdi) # ctx->hash[2] += c \n\
addl %edx, 92(%rdi) # ctx->hash[3] += d \n\
addl %ebp, 96(%rdi) # ctx->hash[4] += e \n\
movq %r9, %rbx # callee-saved \n\
movq %r8, %rbp # callee-saved \n\
##popq %rbx # \n\
##popq %rbp # \n\
##popq %r12 # \n\
##popq %r13 # \n\
##popq %r14 # \n\
##popq %r15 # \n\
"
); /* asm */
#undef RCONST
}
# else
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
* It seems further speedup can be achieved by handling more than
* 64 bytes per one function call (coreutils does that).
@ -571,6 +987,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
ctx->hash[3] += d;
ctx->hash[4] += e;
}
# endif
#elif CONFIG_SHA1_SMALL == 1
/* Middle-sized version, +300 bytes of code on x86. */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)