libbb/sha1: x86_64 version: reorder prologue/epilogue insns

Not clear exactly why, but this increases hashing speed
on Skylake from 454 MB/s to 464 MB/s.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
commit c3cfcc9242 (parent 1fc520ed28)
Author: Denys Vlasenko
Date:   2022-01-04 01:45:13 +01:00
2 changed files with 71 additions and 64 deletions

libbb/hash_md5_sha_x86-64.S

@@ -6,14 +6,14 @@
 .hidden sha1_process_block64
 .type sha1_process_block64, @function
-.balign 8 # allow decoders to fetch at least 4 first insns
+.balign 8 # allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-pushq %r15 #
-pushq %r14 #
-pushq %r13 #
-pushq %r12 #
-pushq %rbp #
-pushq %rbx #
+pushq %rbp # 1 byte insn
+pushq %rbx # 1 byte insn
+pushq %r15 # 2 byte insn
+pushq %r14 # 2 byte insn
+pushq %r13 # 2 byte insn
+pushq %r12 # 2 byte insn
 pushq %rdi # we need ctx at the end
 #Register and stack use:
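
A note on the new push order (my annotation, not part of the commit): a push of a legacy register encodes as a single 0x50+r byte, while pushes of %r12..%r15 need a REX.B prefix and take two bytes, which is what the "1 byte insn" / "2 byte insn" comments refer to. Since the function is only 8-byte aligned, it can start halfway into a 16-byte instruction-fetch block; putting the one-byte pushes first presumably lets five instructions, rather than four, land in that first fetch, which matches the updated .balign comment. A sketch with the encodings spelled out (the label encoding_sketch is made up; the pops only make it a valid no-op function):

    .text
    .balign 8                   # worst case: function starts at a 16n+8 address,
                                # leaving 8 bytes in the first 16-byte fetch block
encoding_sketch:
    pushq %rbp                  # 55    - 1 byte
    pushq %rbx                  # 53    - 1 byte
    pushq %r15                  # 41 57 - 2 bytes
    pushq %r14                  # 41 56 - 2 bytes
    pushq %r13                  # 41 55 - 2 bytes: 8 bytes / 5 insns so far;
    pushq %r12                  # 41 54   the old order (REX pushes first)
    pushq %rdi                  # 57      fits only 4 insns in those 8 bytes
    popq  %rdi
    popq  %r12
    popq  %r13
    popq  %r14
    popq  %r15
    popq  %rbx
    popq  %rbp
    ret
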
@@ -22,24 +22,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-movq 4*8(%rdi), %r8
-bswapq %r8
-movl %r8d, %r9d
-shrq $32, %r8
-movq 4*10(%rdi), %r10
-bswapq %r10
-movl %r10d, %r11d
-shrq $32, %r10
-movq 4*12(%rdi), %r12
-bswapq %r12
-movl %r12d, %r13d
-shrq $32, %r12
-movq 4*14(%rdi), %r14
-bswapq %r14
-movl %r14d, %r15d
-shrq $32, %r14
 movl $3, %eax
 1:
 movq (%rdi,%rax,8), %rsi
@@ -48,12 +30,30 @@ sha1_process_block64:
 movq %rsi, -32(%rsp,%rax,8)
 decl %eax
 jns 1b
 movl 80(%rdi), %eax # a = ctx->hash[0]
 movl 84(%rdi), %ebx # b = ctx->hash[1]
 movl 88(%rdi), %ecx # c = ctx->hash[2]
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]
+movq 4*8(%rdi), %r8
+movq 4*10(%rdi), %r10
+bswapq %r8
+bswapq %r10
+movq 4*12(%rdi), %r12
+movq 4*14(%rdi), %r14
+bswapq %r12
+bswapq %r14
+movl %r8d, %r9d
+shrq $32, %r8
+movl %r10d, %r11d
+shrq $32, %r10
+movl %r12d, %r13d
+shrq $32, %r12
+movl %r14d, %r15d
+shrq $32, %r14
 # 0
 # W[0], already in %esi
 movl %ecx, %edi # c
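
For reference (my annotation, not from the commit): the relocated block above still prepares W[8..15] exactly as the register-use comment describes. Each movq pulls in eight message bytes little-endian, bswapq reverses the whole quadword so the two big-endian 32-bit words come out with the even-indexed one in the high half, and the movl/shrq pair splits them. A commented sketch of the first pair, written as a tiny standalone routine (the label is hypothetical; %rdi points at the 64-byte block, as in the real function):

    .text
load_w8_w9_sketch:
    movq   4*8(%rdi), %r8       # little-endian load of message bytes 32..39
    bswapq %r8                  # byte-reverse: W[8] now in the high 32 bits,
                                # W[9] in the low 32 bits
    movl   %r8d, %r9d           # r9d = W[9] (big-endian word at offset 36)
    shrq   $32, %r8             # r8d = W[8] (big-endian word at offset 32)
    ret
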
@@ -1272,17 +1272,17 @@ sha1_process_block64:
 rorl $2, %ecx # b = rotl32(b,30)
 popq %rdi #
-addl %eax, 80(%rdi) # ctx->hash[0] += a
-addl %ebx, 84(%rdi) # ctx->hash[1] += b
-addl %ecx, 88(%rdi) # ctx->hash[2] += c
-addl %edx, 92(%rdi) # ctx->hash[3] += d
-addl %ebp, 96(%rdi) # ctx->hash[4] += e
-popq %rbx #
-popq %rbp #
 popq %r12 #
+addl %eax, 80(%rdi) # ctx->hash[0] += a
 popq %r13 #
+addl %ebx, 84(%rdi) # ctx->hash[1] += b
 popq %r14 #
+addl %ecx, 88(%rdi) # ctx->hash[2] += c
 popq %r15 #
+addl %edx, 92(%rdi) # ctx->hash[3] += d
+popq %rbx #
+addl %ebp, 96(%rdi) # ctx->hash[4] += e
+popq %rbp #
 ret
 .size sha1_process_block64, .-sha1_process_block64
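
Why the epilogue changed as well (my reading, not stated in the commit): the pops have to mirror the new push order, which gives the %rdi, %r12..%r15, %rbx, %rbp sequence above, and the five ctx->hash[] read-modify-write adds are now slotted between the pops rather than issued back to back; note that each working register is still added into ctx->hash[] before that same register is popped. A minimal sketch of that shape with just two saved registers (hypothetical label; the 84/96 offsets are the same ctx->hash slots named in the comments above):

hash_update_sketch:
    pushq %rbx
    pushq %rbp
    pushq %rdi                  # ctx, reloaded below
    # ... rounds that clobber %rdi and keep b/e in %ebx/%ebp ...
    popq  %rdi                  # pops mirror the pushes,
    addl  %ebx, 84(%rdi)        # with a hash[] update slotted between pops;
    popq  %rbx                  # %ebx is consumed before %rbx is restored
    addl  %ebp, 96(%rdi)
    popq  %rbp
    ret
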

libbb/hash_md5_sha_x86-64.S.sh

@@ -15,14 +15,14 @@ echo \
 .hidden sha1_process_block64
 .type sha1_process_block64, @function
-.balign 8 # allow decoders to fetch at least 4 first insns
+.balign 8 # allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-pushq %r15 #
-pushq %r14 #
-pushq %r13 #
-pushq %r12 #
-pushq %rbp #
-pushq %rbx #
+pushq %rbp # 1 byte insn
+pushq %rbx # 1 byte insn
+pushq %r15 # 2 byte insn
+pushq %r14 # 2 byte insn
+pushq %r13 # 2 byte insn
+pushq %r12 # 2 byte insn
 pushq %rdi # we need ctx at the end
 #Register and stack use:
@@ -31,24 +31,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-movq 4*8(%rdi), %r8
-bswapq %r8
-movl %r8d, %r9d
-shrq $32, %r8
-movq 4*10(%rdi), %r10
-bswapq %r10
-movl %r10d, %r11d
-shrq $32, %r10
-movq 4*12(%rdi), %r12
-bswapq %r12
-movl %r12d, %r13d
-shrq $32, %r12
-movq 4*14(%rdi), %r14
-bswapq %r14
-movl %r14d, %r15d
-shrq $32, %r14
 movl $3, %eax
 1:
 movq (%rdi,%rax,8), %rsi
@@ -57,11 +39,29 @@ sha1_process_block64:
 movq %rsi, -32(%rsp,%rax,8)
 decl %eax
 jns 1b
 movl 80(%rdi), %eax # a = ctx->hash[0]
 movl 84(%rdi), %ebx # b = ctx->hash[1]
 movl 88(%rdi), %ecx # c = ctx->hash[2]
 movl 92(%rdi), %edx # d = ctx->hash[3]
 movl 96(%rdi), %ebp # e = ctx->hash[4]
+movq 4*8(%rdi), %r8
+movq 4*10(%rdi), %r10
+bswapq %r8
+bswapq %r10
+movq 4*12(%rdi), %r12
+movq 4*14(%rdi), %r14
+bswapq %r12
+bswapq %r14
+movl %r8d, %r9d
+shrq $32, %r8
+movl %r10d, %r11d
+shrq $32, %r10
+movl %r12d, %r13d
+shrq $32, %r12
+movl %r14d, %r15d
+shrq $32, %r14
 '
 W32() {
 test "$1" || exit 1
@@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
 test "$1" -ge 8 && echo "%r${1}d"
 }
+# It's possible to interleave insns in rounds to mostly eliminate
+# dependency chains, but this is likely to help only old Pentium-based
+# CPUs (ones without OOO, which can only simultaneously execute a pair
+# of _adjacent_ insns).
+# Testing on an old-ish Silvermont CPU (which has an OOO window of only
+# about 8 insns) shows a very small (~1%) speedup.
 RD1A() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 local n=$(($6))
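
To illustrate what the new comment means by pairing (my example, not code from the script): an in-order, two-wide CPU can only dual-issue instructions that are adjacent and independent, so interleaving two dependency chains would look like this:

pairing_sketch:
    movl %ecx, %esi             # chain A
    movl %edx, %edi             # chain B - independent, can issue with the insn above
    roll $5, %esi               # chain A
    roll $5, %edi               # chain B
    addl %esi, %eax             # chain A
    addl %edi, %edx             # chain B
    ret

An out-of-order core finds this parallelism on its own from the un-interleaved form, which is the comment's point about such a rewrite mattering only on very old or very narrow CPUs.
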
@@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
 echo "
 popq %rdi #
-addl %eax, 80(%rdi) # ctx->hash[0] += a
-addl %ebx, 84(%rdi) # ctx->hash[1] += b
-addl %ecx, 88(%rdi) # ctx->hash[2] += c
-addl %edx, 92(%rdi) # ctx->hash[3] += d
-addl %ebp, 96(%rdi) # ctx->hash[4] += e
-popq %rbx #
-popq %rbp #
 popq %r12 #
+addl %eax, 80(%rdi) # ctx->hash[0] += a
 popq %r13 #
+addl %ebx, 84(%rdi) # ctx->hash[1] += b
 popq %r14 #
+addl %ecx, 88(%rdi) # ctx->hash[2] += c
 popq %r15 #
+addl %edx, 92(%rdi) # ctx->hash[3] += d
+popq %rbx #
+addl %ebp, 96(%rdi) # ctx->hash[4] += e
+popq %rbp #
 ret
 .size sha1_process_block64, .-sha1_process_block64