diff options
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 267 |
1 files changed, 267 insertions, 0 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh new file mode 100755 index 0000000..931c0f0 --- /dev/null +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -0,0 +1,267 @@ +#!/bin/sh + +# We don't regenerate it on every "make" invocation - only by hand. +# The reason is that the changes to generated code are difficult +# to visualize by looking only at this script, it helps when the commit +# also contains the diff of the generated file. +exec >hash_md5_sha_x86-64.S + +echo \ +'### Generated by hash_md5_sha_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) + .section .text.sha1_process_block64,"ax",@progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 4 first insns +sha1_process_block64: + pushq %r15 # + pushq %r14 # + pushq %r13 # + pushq %r12 # + pushq %rbp # + pushq %rbx # + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi: temps +# -32+4*n(%rsp),r8...r15: W[0..7,8..15] +# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?) + + movq 4*8(%rdi), %r8 + bswapq %r8 + movl %r8d, %r9d + shrq $32, %r8 + movq 4*10(%rdi), %r10 + bswapq %r10 + movl %r10d, %r11d + shrq $32, %r10 + movq 4*12(%rdi), %r12 + bswapq %r12 + movl %r12d, %r13d + shrq $32, %r12 + movq 4*14(%rdi), %r14 + bswapq %r14 + movl %r14d, %r15d + shrq $32, %r14 + + movl $3, %eax +1: + movq (%rdi,%rax,8), %rsi + bswapq %rsi + rolq $32, %rsi + movq %rsi, -32(%rsp,%rax,8) + decl %eax + jns 1b + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] +' +W32() { +test "$1" || exit 1 +test "$1" -lt 0 && exit 1 +test "$1" -gt 15 && exit 1 +test "$1" -lt 8 && echo "-32+4*$1(%rsp)" +test "$1" -ge 8 && echo "%r${1}d" +} + +RD1A() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +echo "# $n" +test $n = 0 && echo " + # W[0], already in %esi +";test $n != 0 && test $n -lt 8 && echo " + movl `W32 $n`, %esi # W[n] +";test $n -ge 8 && echo " + # W[n], in %r$n +";echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) +";test $n -lt 8 && echo " + leal $RCONST(%r$e,%rsi),%e$e # e += RCONST + W[n] +";test $n -ge 8 && echo " + leal $RCONST(%r$e,%r$n),%e$e # e += RCONST + W[n] +";echo " + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +RD1B() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n +";test $n0 -lt 8 && echo " + movl `W32 $n13`, %esi # W[(n+13) & 15] + xorl `W32 $n8`, %esi # ^W[(n+8) & 15] + xorl `W32 $n2`, %esi # ^W[(n+2) & 15] + xorl `W32 $n0`, %esi # ^W[n & 15] + roll %esi # + movl %esi, `W32 $n0` # store to W[n & 15] +";test $n0 -ge 8 && echo " + xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] + xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] + xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] + roll `W32 $n0` # +"; echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] +";echo " + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +{ +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 +RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 +RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 +RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 +} | grep -v '^$' + +RD2() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n +";test $n0 -lt 8 && echo " + movl `W32 $n13`, %esi # W[(n+13) & 15] + xorl `W32 $n8`, %esi # ^W[(n+8) & 15] + xorl `W32 $n2`, %esi # ^W[(n+2) & 15] + xorl `W32 $n0`, %esi # ^W[n & 15] + roll %esi # + movl %esi, `W32 $n0` # store to W[n & 15] +";test $n0 -ge 8 && echo " + xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] + xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] + xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] + roll `W32 $n0` # +"; echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + xorl %e$b, %edi # ^b +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] +";echo " + addl %edi, %e$e # e += (c ^ d ^ b) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +{ +RCONST=0x6ED9EBA1 +RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 +RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 +RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 +RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 +} | grep -v '^$' + +RD3() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$b, %edi # di: b + movl %e$b, %esi # si: b + orl %e$c, %edi # di: b | c + andl %e$c, %esi # si: b & c + andl %e$d, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) +";test $n0 -lt 8 && echo " + movl `W32 $n13`, %esi # W[(n+13) & 15] + xorl `W32 $n8`, %esi # ^W[(n+8) & 15] + xorl `W32 $n2`, %esi # ^W[(n+2) & 15] + xorl `W32 $n0`, %esi # ^W[n & 15] + roll %esi # + movl %esi, `W32 $n0` # store to W[n & 15] +";test $n0 -ge 8 && echo " + xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] + xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] + xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] + roll `W32 $n0` # +"; echo " + addl %edi, %e$e # += ((b | c) & d) | (b & c) +";test $n0 -lt 8 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] +";test $n0 -ge 8 && echo " + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] +";echo " + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +{ +#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" +RCONST=-0x70E44324 +RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 +RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 +RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 +RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 +} | grep -v '^$' + +# Round 4 has the same logic as round 2, only n and RCONST are different +{ +#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" +RCONST=-0x359D3E2A +RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 +RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 +RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 +RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 +} | grep -v '^$' + +echo " + popq %rdi # + addl %eax, 80(%rdi) # ctx->hash[0] += a + addl %ebx, 84(%rdi) # ctx->hash[1] += b + addl %ecx, 88(%rdi) # ctx->hash[2] += c + addl %edx, 92(%rdi) # ctx->hash[3] += d + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbx # + popq %rbp # + popq %r12 # + popq %r13 # + popq %r14 # + popq %r15 # + + ret + .size sha1_process_block64, .-sha1_process_block64 +#endif" |