summary | refs | log | tree | commit | diff
path: root/libbb/hash_md5_sha_x86-64.S
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S')
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 1489
1 files changed, 0 insertions, 1489 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
deleted file mode 100644
index 2cdd220..0000000
--- a/libbb/hash_md5_sha_x86-64.S
+++ /dev/null
@@ -1,1489 +0,0 @@
-### Generated by hash_md5_sha_x86-64.S.sh ###
-
-#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
-#ifdef __linux__
- .section .note.GNU-stack, "", @progbits
-#endif
- .section .text.sha1_process_block64, "ax", @progbits
- .globl sha1_process_block64
- .hidden sha1_process_block64
- .type sha1_process_block64, @function
-
- .balign 8 # allow decoders to fetch at least the first 5 insns
-sha1_process_block64:
- pushq %rbp # 1 byte insn
- pushq %rbx # 1 byte insn
-# pushq %r15 # 2 byte insn
- pushq %r14 # 2 byte insn
- pushq %r13 # 2 byte insn
- pushq %r12 # 2 byte insn
- pushq %rdi # we need ctx at the end
-
-#Register and stack use:
-# eax..edx: a..d
-# ebp: e
-# esi,edi,r8..r14: temps
-# r15: unused
-# xmm0..xmm3: W[]
-# xmm4,xmm5: temps
-# xmm6: current round constant
-# xmm7: all round constants
-# -64(%rsp): area for passing RCONST + W[] from vector to integer units
-
- movl 80(%rdi), %eax # a = ctx->hash[0]
- movl 84(%rdi), %ebx # b = ctx->hash[1]
- movl 88(%rdi), %ecx # c = ctx->hash[2]
- movl 92(%rdi), %edx # d = ctx->hash[3]
- movl 96(%rdi), %ebp # e = ctx->hash[4]
-
- movaps sha1const(%rip), %xmm7
- pshufd $0x00, %xmm7, %xmm6
-
- # Load W[] to xmm0..3, byteswapping on the fly.
- #
- # For iterations 0..15, we pass W[] in rsi,r8..r14
- # for use in RD1As instead of spilling them to stack.
- # We lose parallelized addition of RCONST, but LEA
- # can do two additions at once, so it is probably a wash.
- # (We use rsi instead of rN because this makes the LEA in each
- # of the first two RD1As shorter by one byte).
- movq 4*0(%rdi), %rsi
- movq 4*2(%rdi), %r8
- bswapq %rsi
- bswapq %r8
- rolq $32, %rsi # rsi = W[1]:W[0]
- rolq $32, %r8 # r8 = W[3]:W[2]
- movq %rsi, %xmm0
- movq %r8, %xmm4
- punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
-# movaps %xmm0, %xmm4 # add RCONST, spill to stack
-# paddd %xmm6, %xmm4
-# movups %xmm4, -64+16*0(%rsp)
-
- movq 4*4(%rdi), %r9
- movq 4*6(%rdi), %r10
- bswapq %r9
- bswapq %r10
- rolq $32, %r9 # r9 = W[5]:W[4]
- rolq $32, %r10 # r10 = W[7]:W[6]
- movq %r9, %xmm1
- movq %r10, %xmm4
- punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
-
- movq 4*8(%rdi), %r11
- movq 4*10(%rdi), %r12
- bswapq %r11
- bswapq %r12
- rolq $32, %r11 # r11 = W[9]:W[8]
- rolq $32, %r12 # r12 = W[11]:W[10]
- movq %r11, %xmm2
- movq %r12, %xmm4
- punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
-
- movq 4*12(%rdi), %r13
- movq 4*14(%rdi), %r14
- bswapq %r13
- bswapq %r14
- rolq $32, %r13 # r13 = W[13]:W[12]
- rolq $32, %r14 # r14 = W[15]:W[14]
- movq %r13, %xmm3
- movq %r14, %xmm4
- punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
-
-# 0
- leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
- shrq $32, %rsi
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 1
- leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 2
- leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
- shrq $32, %r8
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 3
- leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 4
- leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
- shrq $32, %r9
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 5
- leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 6
- leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
- shrq $32, %r10
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 7
- leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
-# 8
- leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
- shrq $32, %r11
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 9
- leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 10
- leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
- shrq $32, %r12
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 11
- leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- pshufd $0x55, %xmm7, %xmm6
-# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
-# 12
- leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
- shrq $32, %r13
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 13
- leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 14
- leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
- shrq $32, %r14
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 15
- leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
-# 16
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 17
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 18
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 19
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
-# 20
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 21
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 22
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 23
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
-# 24
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 25
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 26
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 27
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
-# 28
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 29
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 30
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 31
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- pshufd $0xaa, %xmm7, %xmm6
-# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
-# 32
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 33
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 34
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 35
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
-# 36
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 37
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 38
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 39
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
-# 40
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 41
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 42
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 43
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
-# 44
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 45
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 46
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 47
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
-# 48
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 49
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 50
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 51
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- pshufd $0xff, %xmm7, %xmm6
-# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
-# 52
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 53
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 54
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 55
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
-# 56
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 57
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 58
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 59
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
-# 60
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 61
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 62
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 63
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
-# 64
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 65
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 66
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 67
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
-# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
-# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
-# 68
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 69
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 70
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 71
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 72
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 73
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 74
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-# 75
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
-# 76
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
-# 77
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
-# 78
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
-# 79
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
-
- popq %rdi #
- popq %r12 #
- addl %eax, 80(%rdi) # ctx->hash[0] += a
- popq %r13 #
- addl %ebx, 84(%rdi) # ctx->hash[1] += b
- popq %r14 #
- addl %ecx, 88(%rdi) # ctx->hash[2] += c
-# popq %r15 #
- addl %edx, 92(%rdi) # ctx->hash[3] += d
- popq %rbx #
- addl %ebp, 96(%rdi) # ctx->hash[4] += e
- popq %rbp #
-
- ret
- .size sha1_process_block64, .-sha1_process_block64
-
-# SHA-1 round constants, packed as four 32-bit dwords in one 16-byte vector.
-# sha1_process_block64 loads the whole vector into %xmm7 with movaps
-# (an aligned load, hence the .balign 16 below) and broadcasts one dword
-# at a time into %xmm6 with pshufd, so the dword order here must stay in
-# step with the pshufd immediates used in the function.
-# The section is mergeable ("M") with an entity size of 16 bytes.
- .section .rodata.cst16.sha1const, "aM", @progbits, 16
- .balign 16
-sha1const:
- .long 0x5A827999 # dword 0: K for rounds 0..19 (pshufd $0x00)
- .long 0x6ED9EBA1 # dword 1: K for rounds 20..39 (per FIPS 180-4)
- .long 0x8F1BBCDC # dword 2: K for rounds 40..59 (per FIPS 180-4)
- .long 0xCA62C1D6 # dword 3: K for rounds 60..79 (pshufd $0xff)
-
-#endif