diff options
author | Denys Vlasenko | 2022-02-07 02:06:18 +0100 |
---|---|---|
committer | Denys Vlasenko | 2022-02-07 02:34:04 +0100 |
commit | c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb (patch) | |
tree | b75c366622b3146a4fdd3f7739b6eaf9d3bc1ac9 /libbb/hash_md5_sha_x86-64.S.sh | |
parent | 987be932ed3cbea56b68bbe85649191c13b66015 (diff) | |
download | busybox-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.zip busybox-c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb.tar.gz |
libbb/sha1: shrink and speed up unrolled x86-64 code
function old new delta
sha1_process_block64 3514 3482 -32
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 9 |
1 files changed, 7 insertions, 2 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 656fb54..fb1e4b5 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -203,8 +203,13 @@ echo "# PREP $@
  movaps $xmmW12, $xmmT1
  psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 
- pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+ movaps $xmmW0, $xmmT2
+ shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 
  xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])