diff options
author | Denys Vlasenko | 2022-01-04 01:45:13 +0100 |
---|---|---|
committer | Denys Vlasenko | 2022-01-04 01:45:52 +0100 |
commit | c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 (patch) | |
tree | fc2c1bd26b585b8da0ba8cbe21b9b9ab745ef42c /libbb | |
parent | 1fc520ed286f815cae1da1e9f8014cb18a256744 (diff) | |
download | busybox-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.zip busybox-c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7.tar.gz |
libbb/sha1: x86_64 version: reorder prologue/epilogue insns
Not clear exactly why, but this increases hashing speed
on Skylake from 454 MB/s to 464 MB/s.
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb')
-rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 60 | ||||
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 67 |
2 files changed, 67 insertions, 60 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 95b85d8..ff78fc0 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -6,14 +6,14 @@ .hidden sha1_process_block64 .type sha1_process_block64, @function - .balign 8 # allow decoders to fetch at least 4 first insns + .balign 8 # allow decoders to fetch at least 5 first insns sha1_process_block64: - pushq %r15 # - pushq %r14 # - pushq %r13 # - pushq %r12 # - pushq %rbp # - pushq %rbx # + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn + pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn pushq %rdi # we need ctx at the end #Register and stack use: @@ -22,24 +22,6 @@ sha1_process_block64: # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - - movq 4*8(%rdi), %r8 - bswapq %r8 - movl %r8d, %r9d - shrq $32, %r8 - movq 4*10(%rdi), %r10 - bswapq %r10 - movl %r10d, %r11d - shrq $32, %r10 - movq 4*12(%rdi), %r12 - bswapq %r12 - movl %r12d, %r13d - shrq $32, %r12 - movq 4*14(%rdi), %r14 - bswapq %r14 - movl %r14d, %r15d - shrq $32, %r14 - movl $3, %eax 1: movq (%rdi,%rax,8), %rsi @@ -48,12 +30,30 @@ sha1_process_block64: movq %rsi, -32(%rsp,%rax,8) decl %eax jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] movl 88(%rdi), %ecx # c = ctx->hash[2] movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + movq 4*8(%rdi), %r8 + movq 4*10(%rdi), %r10 + bswapq %r8 + bswapq %r10 + movq 4*12(%rdi), %r12 + movq 4*14(%rdi), %r14 + bswapq %r12 + bswapq %r14 + movl %r8d, %r9d + shrq $32, %r8 + movl %r10d, %r11d + shrq $32, %r10 + movl %r12d, %r13d + shrq $32, %r12 + movl %r14d, %r15d + shrq $32, %r14 + # 0 # W[0], already in %esi movl %ecx, %edi # c @@ -1272,17 +1272,17 @@ sha1_process_block64: rorl $2, %ecx # b = rotl32(b,30) popq %rdi # + popq %r12 # addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c + popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d - addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbp # - popq %r12 # - popq %r13 # - popq %r14 # - popq %r15 # ret .size sha1_process_block64, .-sha1_process_block64 diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index c5f0ef5..7e50b64 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -15,14 +15,14 @@ echo \ .hidden sha1_process_block64 .type sha1_process_block64, @function - .balign 8 # allow decoders to fetch at least 4 first insns + .balign 8 # allow decoders to fetch at least 5 first insns sha1_process_block64: - pushq %r15 # - pushq %r14 # - pushq %r13 # - pushq %r12 # - pushq %rbp # - pushq %rbx # + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn + pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn pushq %rdi # we need ctx at the end #Register and stack use: @@ -31,24 +31,6 @@ sha1_process_block64: # esi,edi: temps # -32+4*n(%rsp),r8...r15: W[0..7,8..15] # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - - movq 4*8(%rdi), %r8 - bswapq %r8 - movl %r8d, %r9d - shrq $32, %r8 - movq 4*10(%rdi), %r10 - bswapq %r10 - movl %r10d, %r11d - shrq $32, %r10 - movq 4*12(%rdi), %r12 - bswapq %r12 - movl %r12d, %r13d - shrq $32, %r12 - movq 4*14(%rdi), %r14 - bswapq %r14 - movl %r14d, %r15d - shrq $32, %r14 - movl $3, %eax 1: movq (%rdi,%rax,8), %rsi @@ -57,11 +39,29 @@ sha1_process_block64: movq %rsi, -32(%rsp,%rax,8) decl %eax jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] movl 88(%rdi), %ecx # c = ctx->hash[2] movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + + movq 4*8(%rdi), %r8 + movq 4*10(%rdi), %r10 + bswapq %r8 + bswapq %r10 + movq 4*12(%rdi), %r12 + movq 4*14(%rdi), %r14 + bswapq %r12 + bswapq %r14 + movl %r8d, %r9d + shrq $32, %r8 + movl %r10d, %r11d + shrq $32, %r10 + movl %r12d, %r13d + shrq $32, %r12 + movl %r14d, %r15d + shrq $32, %r14 ' W32() { test "$1" || exit 1 @@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)" test "$1" -ge 8 && echo "%r${1}d" } +# It's possible to interleave insns in rounds to mostly eliminate +# dependency chains, but this likely to only help old Pentium-based +# CPUs (ones without OOO, which can only simultaneously execute a pair +# of _adjacent_ insns). +# Testing on old-ish Silvermont CPU (which has OOO window of only +# about ~8 insns) shows very small (~1%) speedup. + RD1A() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 local n=$(($6)) @@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b echo " popq %rdi # + popq %r12 # addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c + popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d - addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e popq %rbp # - popq %r12 # - popq %r13 # - popq %r14 # - popq %r15 # ret .size sha1_process_block64, .-sha1_process_block64 |