libbb/sha1: x86_64 version: reorder prologue/epilogue insns
Not clear exactly why, but this increases hashing speed on Skylake from 454 MB/s to 464 MB/s. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
1fc520ed28
commit
c3cfcc9242
@ -6,14 +6,14 @@
|
|||||||
.hidden sha1_process_block64
|
.hidden sha1_process_block64
|
||||||
.type sha1_process_block64, @function
|
.type sha1_process_block64, @function
|
||||||
|
|
||||||
.balign 8 # allow decoders to fetch at least 4 first insns
|
.balign 8 # allow decoders to fetch at least 5 first insns
|
||||||
sha1_process_block64:
|
sha1_process_block64:
|
||||||
pushq %r15 #
|
pushq %rbp # 1 byte insn
|
||||||
pushq %r14 #
|
pushq %rbx # 1 byte insn
|
||||||
pushq %r13 #
|
pushq %r15 # 2 byte insn
|
||||||
pushq %r12 #
|
pushq %r14 # 2 byte insn
|
||||||
pushq %rbp #
|
pushq %r13 # 2 byte insn
|
||||||
pushq %rbx #
|
pushq %r12 # 2 byte insn
|
||||||
pushq %rdi # we need ctx at the end
|
pushq %rdi # we need ctx at the end
|
||||||
|
|
||||||
#Register and stack use:
|
#Register and stack use:
|
||||||
@ -22,24 +22,6 @@ sha1_process_block64:
|
|||||||
# esi,edi: temps
|
# esi,edi: temps
|
||||||
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
|
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
|
||||||
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
|
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
|
||||||
|
|
||||||
movq 4*8(%rdi), %r8
|
|
||||||
bswapq %r8
|
|
||||||
movl %r8d, %r9d
|
|
||||||
shrq $32, %r8
|
|
||||||
movq 4*10(%rdi), %r10
|
|
||||||
bswapq %r10
|
|
||||||
movl %r10d, %r11d
|
|
||||||
shrq $32, %r10
|
|
||||||
movq 4*12(%rdi), %r12
|
|
||||||
bswapq %r12
|
|
||||||
movl %r12d, %r13d
|
|
||||||
shrq $32, %r12
|
|
||||||
movq 4*14(%rdi), %r14
|
|
||||||
bswapq %r14
|
|
||||||
movl %r14d, %r15d
|
|
||||||
shrq $32, %r14
|
|
||||||
|
|
||||||
movl $3, %eax
|
movl $3, %eax
|
||||||
1:
|
1:
|
||||||
movq (%rdi,%rax,8), %rsi
|
movq (%rdi,%rax,8), %rsi
|
||||||
@ -48,12 +30,30 @@ sha1_process_block64:
|
|||||||
movq %rsi, -32(%rsp,%rax,8)
|
movq %rsi, -32(%rsp,%rax,8)
|
||||||
decl %eax
|
decl %eax
|
||||||
jns 1b
|
jns 1b
|
||||||
|
|
||||||
movl 80(%rdi), %eax # a = ctx->hash[0]
|
movl 80(%rdi), %eax # a = ctx->hash[0]
|
||||||
movl 84(%rdi), %ebx # b = ctx->hash[1]
|
movl 84(%rdi), %ebx # b = ctx->hash[1]
|
||||||
movl 88(%rdi), %ecx # c = ctx->hash[2]
|
movl 88(%rdi), %ecx # c = ctx->hash[2]
|
||||||
movl 92(%rdi), %edx # d = ctx->hash[3]
|
movl 92(%rdi), %edx # d = ctx->hash[3]
|
||||||
movl 96(%rdi), %ebp # e = ctx->hash[4]
|
movl 96(%rdi), %ebp # e = ctx->hash[4]
|
||||||
|
|
||||||
|
movq 4*8(%rdi), %r8
|
||||||
|
movq 4*10(%rdi), %r10
|
||||||
|
bswapq %r8
|
||||||
|
bswapq %r10
|
||||||
|
movq 4*12(%rdi), %r12
|
||||||
|
movq 4*14(%rdi), %r14
|
||||||
|
bswapq %r12
|
||||||
|
bswapq %r14
|
||||||
|
movl %r8d, %r9d
|
||||||
|
shrq $32, %r8
|
||||||
|
movl %r10d, %r11d
|
||||||
|
shrq $32, %r10
|
||||||
|
movl %r12d, %r13d
|
||||||
|
shrq $32, %r12
|
||||||
|
movl %r14d, %r15d
|
||||||
|
shrq $32, %r14
|
||||||
|
|
||||||
# 0
|
# 0
|
||||||
# W[0], already in %esi
|
# W[0], already in %esi
|
||||||
movl %ecx, %edi # c
|
movl %ecx, %edi # c
|
||||||
@ -1272,17 +1272,17 @@ sha1_process_block64:
|
|||||||
rorl $2, %ecx # b = rotl32(b,30)
|
rorl $2, %ecx # b = rotl32(b,30)
|
||||||
|
|
||||||
popq %rdi #
|
popq %rdi #
|
||||||
addl %eax, 80(%rdi) # ctx->hash[0] += a
|
|
||||||
addl %ebx, 84(%rdi) # ctx->hash[1] += b
|
|
||||||
addl %ecx, 88(%rdi) # ctx->hash[2] += c
|
|
||||||
addl %edx, 92(%rdi) # ctx->hash[3] += d
|
|
||||||
addl %ebp, 96(%rdi) # ctx->hash[4] += e
|
|
||||||
popq %rbx #
|
|
||||||
popq %rbp #
|
|
||||||
popq %r12 #
|
popq %r12 #
|
||||||
|
addl %eax, 80(%rdi) # ctx->hash[0] += a
|
||||||
popq %r13 #
|
popq %r13 #
|
||||||
|
addl %ebx, 84(%rdi) # ctx->hash[1] += b
|
||||||
popq %r14 #
|
popq %r14 #
|
||||||
|
addl %ecx, 88(%rdi) # ctx->hash[2] += c
|
||||||
popq %r15 #
|
popq %r15 #
|
||||||
|
addl %edx, 92(%rdi) # ctx->hash[3] += d
|
||||||
|
popq %rbx #
|
||||||
|
addl %ebp, 96(%rdi) # ctx->hash[4] += e
|
||||||
|
popq %rbp #
|
||||||
|
|
||||||
ret
|
ret
|
||||||
.size sha1_process_block64, .-sha1_process_block64
|
.size sha1_process_block64, .-sha1_process_block64
|
||||||
|
@ -15,14 +15,14 @@ echo \
|
|||||||
.hidden sha1_process_block64
|
.hidden sha1_process_block64
|
||||||
.type sha1_process_block64, @function
|
.type sha1_process_block64, @function
|
||||||
|
|
||||||
.balign 8 # allow decoders to fetch at least 4 first insns
|
.balign 8 # allow decoders to fetch at least 5 first insns
|
||||||
sha1_process_block64:
|
sha1_process_block64:
|
||||||
pushq %r15 #
|
pushq %rbp # 1 byte insn
|
||||||
pushq %r14 #
|
pushq %rbx # 1 byte insn
|
||||||
pushq %r13 #
|
pushq %r15 # 2 byte insn
|
||||||
pushq %r12 #
|
pushq %r14 # 2 byte insn
|
||||||
pushq %rbp #
|
pushq %r13 # 2 byte insn
|
||||||
pushq %rbx #
|
pushq %r12 # 2 byte insn
|
||||||
pushq %rdi # we need ctx at the end
|
pushq %rdi # we need ctx at the end
|
||||||
|
|
||||||
#Register and stack use:
|
#Register and stack use:
|
||||||
@ -31,24 +31,6 @@ sha1_process_block64:
|
|||||||
# esi,edi: temps
|
# esi,edi: temps
|
||||||
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
|
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
|
||||||
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
|
# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
|
||||||
|
|
||||||
movq 4*8(%rdi), %r8
|
|
||||||
bswapq %r8
|
|
||||||
movl %r8d, %r9d
|
|
||||||
shrq $32, %r8
|
|
||||||
movq 4*10(%rdi), %r10
|
|
||||||
bswapq %r10
|
|
||||||
movl %r10d, %r11d
|
|
||||||
shrq $32, %r10
|
|
||||||
movq 4*12(%rdi), %r12
|
|
||||||
bswapq %r12
|
|
||||||
movl %r12d, %r13d
|
|
||||||
shrq $32, %r12
|
|
||||||
movq 4*14(%rdi), %r14
|
|
||||||
bswapq %r14
|
|
||||||
movl %r14d, %r15d
|
|
||||||
shrq $32, %r14
|
|
||||||
|
|
||||||
movl $3, %eax
|
movl $3, %eax
|
||||||
1:
|
1:
|
||||||
movq (%rdi,%rax,8), %rsi
|
movq (%rdi,%rax,8), %rsi
|
||||||
@ -57,11 +39,29 @@ sha1_process_block64:
|
|||||||
movq %rsi, -32(%rsp,%rax,8)
|
movq %rsi, -32(%rsp,%rax,8)
|
||||||
decl %eax
|
decl %eax
|
||||||
jns 1b
|
jns 1b
|
||||||
|
|
||||||
movl 80(%rdi), %eax # a = ctx->hash[0]
|
movl 80(%rdi), %eax # a = ctx->hash[0]
|
||||||
movl 84(%rdi), %ebx # b = ctx->hash[1]
|
movl 84(%rdi), %ebx # b = ctx->hash[1]
|
||||||
movl 88(%rdi), %ecx # c = ctx->hash[2]
|
movl 88(%rdi), %ecx # c = ctx->hash[2]
|
||||||
movl 92(%rdi), %edx # d = ctx->hash[3]
|
movl 92(%rdi), %edx # d = ctx->hash[3]
|
||||||
movl 96(%rdi), %ebp # e = ctx->hash[4]
|
movl 96(%rdi), %ebp # e = ctx->hash[4]
|
||||||
|
|
||||||
|
movq 4*8(%rdi), %r8
|
||||||
|
movq 4*10(%rdi), %r10
|
||||||
|
bswapq %r8
|
||||||
|
bswapq %r10
|
||||||
|
movq 4*12(%rdi), %r12
|
||||||
|
movq 4*14(%rdi), %r14
|
||||||
|
bswapq %r12
|
||||||
|
bswapq %r14
|
||||||
|
movl %r8d, %r9d
|
||||||
|
shrq $32, %r8
|
||||||
|
movl %r10d, %r11d
|
||||||
|
shrq $32, %r10
|
||||||
|
movl %r12d, %r13d
|
||||||
|
shrq $32, %r12
|
||||||
|
movl %r14d, %r15d
|
||||||
|
shrq $32, %r14
|
||||||
'
|
'
|
||||||
W32() {
|
W32() {
|
||||||
test "$1" || exit 1
|
test "$1" || exit 1
|
||||||
@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
|
|||||||
test "$1" -ge 8 && echo "%r${1}d"
|
test "$1" -ge 8 && echo "%r${1}d"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# It's possible to interleave insns in rounds to mostly eliminate
|
||||||
|
# dependency chains, but this is likely to only help old Pentium-based
|
||||||
|
# CPUs (ones without OOO, which can only simultaneously execute a pair
|
||||||
|
# of _adjacent_ insns).
|
||||||
|
# Testing on an old-ish Silvermont CPU (which has an OOO window of only
|
||||||
|
# about 8 insns) shows a very small (~1%) speedup.
|
||||||
|
|
||||||
RD1A() {
|
RD1A() {
|
||||||
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
|
local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
|
||||||
local n=$(($6))
|
local n=$(($6))
|
||||||
@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
|
|||||||
|
|
||||||
echo "
|
echo "
|
||||||
popq %rdi #
|
popq %rdi #
|
||||||
addl %eax, 80(%rdi) # ctx->hash[0] += a
|
|
||||||
addl %ebx, 84(%rdi) # ctx->hash[1] += b
|
|
||||||
addl %ecx, 88(%rdi) # ctx->hash[2] += c
|
|
||||||
addl %edx, 92(%rdi) # ctx->hash[3] += d
|
|
||||||
addl %ebp, 96(%rdi) # ctx->hash[4] += e
|
|
||||||
popq %rbx #
|
|
||||||
popq %rbp #
|
|
||||||
popq %r12 #
|
popq %r12 #
|
||||||
|
addl %eax, 80(%rdi) # ctx->hash[0] += a
|
||||||
popq %r13 #
|
popq %r13 #
|
||||||
|
addl %ebx, 84(%rdi) # ctx->hash[1] += b
|
||||||
popq %r14 #
|
popq %r14 #
|
||||||
|
addl %ecx, 88(%rdi) # ctx->hash[2] += c
|
||||||
popq %r15 #
|
popq %r15 #
|
||||||
|
addl %edx, 92(%rdi) # ctx->hash[3] += d
|
||||||
|
popq %rbx #
|
||||||
|
addl %ebp, 96(%rdi) # ctx->hash[4] += e
|
||||||
|
popq %rbp #
|
||||||
|
|
||||||
ret
|
ret
|
||||||
.size sha1_process_block64, .-sha1_process_block64
|
.size sha1_process_block64, .-sha1_process_block64
|
||||||
|
Loading…
Reference in New Issue
Block a user