From c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 4 Jan 2022 01:45:13 +0100
Subject: libbb/sha1: x86_64 version: reorder prologue/epilogue insns

It is not clear exactly why, but reordering these instructions increases
SHA1 hashing speed on Skylake from 454 MB/s to 464 MB/s.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/hash_md5_sha_x86-64.S | 60 ++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

(limited to 'libbb/hash_md5_sha_x86-64.S')

diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 95b85d8..ff78fc0 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -6,14 +6,14 @@
 	.hidden sha1_process_block64
 	.type	sha1_process_block64, @function
 
-	.balign	8	# allow decoders to fetch at least 4 first insns
+	.balign	8	# allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-	pushq	%r15	#
-	pushq	%r14	#
-	pushq	%r13	#
-	pushq	%r12	#
-	pushq	%rbp	#
-	pushq	%rbx	#
+	pushq	%rbp	# 1 byte insn
+	pushq	%rbx	# 1 byte insn
+	pushq	%r15	# 2 byte insn
+	pushq	%r14	# 2 byte insn
+	pushq	%r13	# 2 byte insn
+	pushq	%r12	# 2 byte insn
 	pushq	%rdi	# we need ctx at the end
 
 #Register and stack use:
@@ -22,24 +22,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-
-	movq	4*8(%rdi), %r8
-	bswapq	%r8
-	movl	%r8d, %r9d
-	shrq	$32, %r8
-	movq	4*10(%rdi), %r10
-	bswapq	%r10
-	movl	%r10d, %r11d
-	shrq	$32, %r10
-	movq	4*12(%rdi), %r12
-	bswapq	%r12
-	movl	%r12d, %r13d
-	shrq	$32, %r12
-	movq	4*14(%rdi), %r14
-	bswapq	%r14
-	movl	%r14d, %r15d
-	shrq	$32, %r14
-
 	movl	$3, %eax
 1:
 	movq	(%rdi,%rax,8), %rsi
@@ -48,12 +30,30 @@ sha1_process_block64:
 	movq	%rsi, -32(%rsp,%rax,8)
 	decl	%eax
 	jns	1b
+
 	movl	80(%rdi), %eax		# a = ctx->hash[0]
 	movl	84(%rdi), %ebx		# b = ctx->hash[1]
 	movl	88(%rdi), %ecx		# c = ctx->hash[2]
 	movl	92(%rdi), %edx		# d = ctx->hash[3]
 	movl	96(%rdi), %ebp		# e = ctx->hash[4]
 
+	movq	4*8(%rdi), %r8
+	movq	4*10(%rdi), %r10
+	bswapq	%r8
+	bswapq	%r10
+	movq	4*12(%rdi), %r12
+	movq	4*14(%rdi), %r14
+	bswapq	%r12
+	bswapq	%r14
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+	movl	%r12d, %r13d
+	shrq	$32, %r12
+	movl	%r14d, %r15d
+	shrq	$32, %r14
+
 # 0
 	# W[0], already in %esi
 	movl	%ecx, %edi		# c
@@ -1272,17 +1272,17 @@ sha1_process_block64:
 	rorl	$2, %ecx		# b = rotl32(b,30)
 
 	popq	%rdi		#
+	popq	%r12		#
 	addl	%eax, 80(%rdi)  # ctx->hash[0] += a
+	popq	%r13		#
 	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b
+	popq	%r14		#
 	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c
+	popq	%r15		#
 	addl	%edx, 92(%rdi)  # ctx->hash[3] += d
-	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
 	popq	%rbx		#
+	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
 	popq	%rbp		#
-	popq	%r12		#
-	popq	%r13		#
-	popq	%r14		#
-	popq	%r15		#
 
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
-- 
cgit v1.1