Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S')
-rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 1489 |
1 file changed, 0 insertions, 1489 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
deleted file mode 100644
index 2cdd220..0000000
--- a/libbb/hash_md5_sha_x86-64.S
+++ /dev/null
@@ -1,1489 +0,0 @@
-### Generated by hash_md5_sha_x86-64.S.sh ###
-
-#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
-#ifdef __linux__
-	.section	.note.GNU-stack, "", @progbits
-#endif
-	.section	.text.sha1_process_block64, "ax", @progbits
-	.globl	sha1_process_block64
-	.hidden	sha1_process_block64
-	.type	sha1_process_block64, @function
-
-	.balign	8	# allow decoders to fetch at least 5 first insns
-sha1_process_block64:
-	pushq	%rbp	# 1 byte insn
-	pushq	%rbx	# 1 byte insn
-#	pushq	%r15	# 2 byte insn
-	pushq	%r14	# 2 byte insn
-	pushq	%r13	# 2 byte insn
-	pushq	%r12	# 2 byte insn
-	pushq	%rdi	# we need ctx at the end
-
-#Register and stack use:
-# eax..edx: a..d
-# ebp: e
-# esi,edi,r8..r14: temps
-# r15: unused
-# xmm0..xmm3: W[]
-# xmm4,xmm5: temps
-# xmm6: current round constant
-# xmm7: all round constants
-# -64(%rsp): area for passing RCONST + W[] from vector to integer units
-
-	movl	80(%rdi), %eax	# a = ctx->hash[0]
-	movl	84(%rdi), %ebx	# b = ctx->hash[1]
-	movl	88(%rdi), %ecx	# c = ctx->hash[2]
-	movl	92(%rdi), %edx	# d = ctx->hash[3]
-	movl	96(%rdi), %ebp	# e = ctx->hash[4]
-
-	movaps	sha1const(%rip), %xmm7
-	pshufd	$0x00, %xmm7, %xmm6
-
-	# Load W[] to xmm0..3, byteswapping on the fly.
-	#
-	# For iterations 0..15, we pass W[] in rsi,r8..r14
-	# for use in RD1As instead of spilling them to stack.
-	# We lose parallelized addition of RCONST, but LEA
-	# can do two additions at once, so it is probably a wash.
-	# (We use rsi instead of rN because this makes two
-	# LEAs in two first RD1As shorter by one byte).
-	movq	4*0(%rdi), %rsi
-	movq	4*2(%rdi), %r8
-	bswapq	%rsi
-	bswapq	%r8
-	rolq	$32, %rsi	# rsi = W[1]:W[0]
-	rolq	$32, %r8	# r8 = W[3]:W[2]
-	movq	%rsi, %xmm0
-	movq	%r8, %xmm4
-	punpcklqdq %xmm4, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
-#	movaps	%xmm0, %xmm4	# add RCONST, spill to stack
-#	paddd	%xmm6, %xmm4
-#	movups	%xmm4, -64+16*0(%rsp)
-
-	movq	4*4(%rdi), %r9
-	movq	4*6(%rdi), %r10
-	bswapq	%r9
-	bswapq	%r10
-	rolq	$32, %r9	# r9 = W[5]:W[4]
-	rolq	$32, %r10	# r10 = W[7]:W[6]
-	movq	%r9, %xmm1
-	movq	%r10, %xmm4
-	punpcklqdq %xmm4, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
-
-	movq	4*8(%rdi), %r11
-	movq	4*10(%rdi), %r12
-	bswapq	%r11
-	bswapq	%r12
-	rolq	$32, %r11	# r11 = W[9]:W[8]
-	rolq	$32, %r12	# r12 = W[11]:W[10]
-	movq	%r11, %xmm2
-	movq	%r12, %xmm4
-	punpcklqdq %xmm4, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
-
-	movq	4*12(%rdi), %r13
-	movq	4*14(%rdi), %r14
-	bswapq	%r13
-	bswapq	%r14
-	rolq	$32, %r13	# r13 = W[13]:W[12]
-	rolq	$32, %r14	# r14 = W[15]:W[14]
-	movq	%r13, %xmm3
-	movq	%r14, %xmm4
-	punpcklqdq %xmm4, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
-
-# 0
-	leal	0x5A827999(%rbp,%rsi), %ebp	# e += RCONST + W[n]
-	shrq	$32, %rsi
-	movl	%ecx, %edi	# c
-	xorl	%edx, %edi	# ^d
-	andl	%ebx, %edi	# &b
-	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
-	movl	%eax, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ebp	# e += rotl32(a,5)
-	rorl	$2, %ebx	# b = rotl32(b,30)
-# 1
-	leal	0x5A827999(%rdx,%rsi), %edx	# e += RCONST + W[n]
-	movl	%ebx, %edi	# c
-	xorl	%ecx, %edi	# ^d
-	andl	%eax, %edi	# &b
-	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
-	movl	%ebp, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %edx	# e += rotl32(a,5)
-	rorl	$2, %eax	# b = rotl32(b,30)
-# 2
-	leal	0x5A827999(%rcx,%r8), %ecx	# e += RCONST + W[n]
-	shrq	$32, %r8
-	movl	%eax, %edi	# c
-	xorl	%ebx, %edi	# ^d
-	andl	%ebp, %edi	# &b
-	xorl	%ebx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ecx	# e += (((c ^ d) & b) ^ d)
-	movl	%edx, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ecx	# e += rotl32(a,5)
-	rorl	$2, %ebp	# b = rotl32(b,30)
-# 3
-	leal	0x5A827999(%rbx,%r8), %ebx	# e += RCONST + W[n]
-	movl	%ebp, %edi	# c
-	xorl	%eax, %edi	# ^d
-	andl	%edx, %edi	# &b
-	xorl	%eax, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ebx	# e += (((c ^ d) & b) ^ d)
-	movl	%ecx, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ebx	# e += rotl32(a,5)
-	rorl	$2, %edx	# b = rotl32(b,30)
-# 4
-	leal	0x5A827999(%rax,%r9), %eax	# e += RCONST + W[n]
-	shrq	$32, %r9
-	movl	%edx, %edi	# c
-	xorl	%ebp, %edi	# ^d
-	andl	%ecx, %edi	# &b
-	xorl	%ebp, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %eax	# e += (((c ^ d) & b) ^ d)
-	movl	%ebx, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %eax	# e += rotl32(a,5)
-	rorl	$2, %ecx	# b = rotl32(b,30)
-# 5
-	leal	0x5A827999(%rbp,%r9), %ebp	# e += RCONST + W[n]
-	movl	%ecx, %edi	# c
-	xorl	%edx, %edi	# ^d
-	andl	%ebx, %edi	# &b
-	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
-	movl	%eax, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ebp	# e += rotl32(a,5)
-	rorl	$2, %ebx	# b = rotl32(b,30)
-# 6
-	leal	0x5A827999(%rdx,%r10), %edx	# e += RCONST + W[n]
-	shrq	$32, %r10
-	movl	%ebx, %edi	# c
-	xorl	%ecx, %edi	# ^d
-	andl	%eax, %edi	# &b
-	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
-	movl	%ebp, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %edx	# e += rotl32(a,5)
-	rorl	$2, %eax	# b = rotl32(b,30)
-# 7
-	leal	0x5A827999(%rcx,%r10), %ecx	# e += RCONST + W[n]
-	movl	%eax, %edi	# c
-	xorl	%ebx, %edi	# ^d
-	andl	%ebp, %edi	# &b
-	xorl	%ebx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ecx	# e += (((c ^ d) & b) ^ d)
-	movl	%edx, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ecx	# e += rotl32(a,5)
-	rorl	$2, %ebp	# b = rotl32(b,30)
-# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
-	movaps	%xmm3, %xmm4
-	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
-#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
-	movaps	%xmm0, %xmm5
-	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
-	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
-	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
-	xorps	%xmm5, %xmm0	# ^
-	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
-	movaps	%xmm0, %xmm5
-	xorps	%xmm4, %xmm4	# rol(W0,1):
-	pcmpgtd	%xmm0, %xmm4	# ffffffff for elements <0 (ones with msb bit 1)
-	paddd	%xmm0, %xmm0	# shift left by 1
-	psubd	%xmm4, %xmm0	# add 1 to those who had msb bit 1
-	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
-	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
-	movaps	%xmm5, %xmm4
-	pslld	$2, %xmm5
-	psrld	$30, %xmm4
-#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
-	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
-	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
-	movaps	%xmm0, %xmm5
-	paddd	%xmm6, %xmm5
-	movups	%xmm5, -64+16*0(%rsp)
-# 8
-	leal	0x5A827999(%rbx,%r11), %ebx	# e += RCONST + W[n]
-	shrq	$32, %r11
-	movl	%ebp, %edi	# c
-	xorl	%eax, %edi	# ^d
-	andl	%edx, %edi	# &b
-	xorl	%eax, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ebx	# e += (((c ^ d) & b) ^ d)
-	movl	%ecx, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ebx	# e += rotl32(a,5)
-	rorl	$2, %edx	# b = rotl32(b,30)
-# 9
-	leal	0x5A827999(%rax,%r11), %eax	# e += RCONST + W[n]
-	movl	%edx, %edi	# c
-	xorl	%ebp, %edi	# ^d
-	andl	%ecx, %edi	# &b
-	xorl	%ebp, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %eax	# e += (((c ^ d) & b) ^ d)
-	movl	%ebx, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %eax	# e += rotl32(a,5)
-	rorl	$2, %ecx	# b = rotl32(b,30)
-# 10
-	leal	0x5A827999(%rbp,%r12), %ebp	# e += RCONST + W[n]
-	shrq	$32, %r12
-	movl	%ecx, %edi	# c
-	xorl	%edx, %edi	# ^d
-	andl	%ebx, %edi	# &b
-	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
-	movl	%eax, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %ebp	# e += rotl32(a,5)
-	rorl	$2, %ebx	# b = rotl32(b,30)
-# 11
-	leal	0x5A827999(%rdx,%r12), %edx	# e += RCONST + W[n]
-	movl	%ebx, %edi	# c
-	xorl	%ecx, %edi	# ^d
-	andl	%eax, %edi	# &b
-	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
-	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
-	movl	%ebp, %edi	#
-	roll	$5, %edi	# rotl32(a,5)
-	addl	%edi, %edx	# e += rotl32(a,5)
-	rorl	$2, %eax	# b = rotl32(b,30)
-	pshufd	$0x55, %xmm7, %xmm6
-# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
-	movaps	%xmm0, %xmm4
-	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
-#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
-#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
-# same result as above, but shorter and faster:
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
-# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] - shrq $32, %r13 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] - shrq $32, %r14 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 16 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 17 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 18 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 19 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 20 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 21 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 22 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 23 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 24 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 25 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 26 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 27 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 28 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 29 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 30 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 31 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xaa, %xmm7, %xmm6 -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 32 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 33 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 34 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 35 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 36 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 37 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 38 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 39 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 40 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 41 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 42 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 43 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 44 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 45 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 46 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 47 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 48 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 49 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 50 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 51 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xff, %xmm7, %xmm6 -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 52 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 53 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 54 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 55 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 56 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 57 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 58 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 59 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 60 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 61 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 62 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 63 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 64 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 65 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 66 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 67 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
-	movaps	%xmm3, %xmm5
-	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
-	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
-	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
-	xorps	%xmm5, %xmm3	# ^
-	# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
-	movaps	%xmm3, %xmm5
-	xorps	%xmm4, %xmm4	# rol(W0,1):
-	pcmpgtd	%xmm3, %xmm4	# ffffffff for elements <0 (ones with msb bit 1)
-	paddd	%xmm3, %xmm3	# shift left by 1
-	psubd	%xmm4, %xmm3	# add 1 to those who had msb bit 1
-	# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
-	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
-	movaps	%xmm5, %xmm4
-	pslld	$2, %xmm5
-	psrld	$30, %xmm4
-#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
-	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
-	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
-	movaps	%xmm3, %xmm5
-	paddd	%xmm6, %xmm5
-	movups	%xmm5, -64+16*3(%rsp)
-# 68
-	movl	%ebp, %edi	# c
-	xorl	%eax, %edi	# ^d
-	xorl	%edx, %edi	# ^b
-	addl	-64+4*4(%rsp), %ebx	# e += RCONST + W[n & 15]
-	addl	%edi, %ebx	# e += (c ^ d ^ b)
-	movl	%ecx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ebx	# e += rotl32(a,5)
-	rorl	$2, %edx	# b = rotl32(b,30)
-# 69
-	movl	%edx, %edi	# c
-	xorl	%ebp, %edi	# ^d
-	xorl	%ecx, %edi	# ^b
-	addl	-64+4*5(%rsp), %eax	# e += RCONST + W[n & 15]
-	addl	%edi, %eax	# e += (c ^ d ^ b)
-	movl	%ebx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %eax	# e += rotl32(a,5)
-	rorl	$2, %ecx	# b = rotl32(b,30)
-# 70
-	movl	%ecx, %edi	# c
-	xorl	%edx, %edi	# ^d
-	xorl	%ebx, %edi	# ^b
-	addl	-64+4*6(%rsp), %ebp	# e += RCONST + W[n & 15]
-	addl	%edi, %ebp	# e += (c ^ d ^ b)
-	movl	%eax, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ebp	# e += rotl32(a,5)
-	rorl	$2, %ebx	# b = rotl32(b,30)
-# 71
-	movl	%ebx, %edi	# c
-	xorl	%ecx, %edi	# ^d
-	xorl	%eax, %edi	# ^b
-	addl	-64+4*7(%rsp), %edx	# e += RCONST + W[n & 15]
-	addl	%edi, %edx	# e += (c ^ d ^ b)
-	movl	%ebp, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %edx	# e += rotl32(a,5)
-	rorl	$2, %eax	# b = rotl32(b,30)
-# 72
-	movl	%eax, %edi	# c
-	xorl	%ebx, %edi	# ^d
-	xorl	%ebp, %edi	# ^b
-	addl	-64+4*8(%rsp), %ecx	# e += RCONST + W[n & 15]
-	addl	%edi, %ecx	# e += (c ^ d ^ b)
-	movl	%edx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ecx	# e += rotl32(a,5)
-	rorl	$2, %ebp	# b = rotl32(b,30)
-# 73
-	movl	%ebp, %edi	# c
-	xorl	%eax, %edi	# ^d
-	xorl	%edx, %edi	# ^b
-	addl	-64+4*9(%rsp), %ebx	# e += RCONST + W[n & 15]
-	addl	%edi, %ebx	# e += (c ^ d ^ b)
-	movl	%ecx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ebx	# e += rotl32(a,5)
-	rorl	$2, %edx	# b = rotl32(b,30)
-# 74
-	movl	%edx, %edi	# c
-	xorl	%ebp, %edi	# ^d
-	xorl	%ecx, %edi	# ^b
-	addl	-64+4*10(%rsp), %eax	# e += RCONST + W[n & 15]
-	addl	%edi, %eax	# e += (c ^ d ^ b)
-	movl	%ebx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %eax	# e += rotl32(a,5)
-	rorl	$2, %ecx	# b = rotl32(b,30)
-# 75
-	movl	%ecx, %edi	# c
-	xorl	%edx, %edi	# ^d
-	xorl	%ebx, %edi	# ^b
-	addl	-64+4*11(%rsp), %ebp	# e += RCONST + W[n & 15]
-	addl	%edi, %ebp	# e += (c ^ d ^ b)
-	movl	%eax, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ebp	# e += rotl32(a,5)
-	rorl	$2, %ebx	# b = rotl32(b,30)
-# 76
-	movl	%ebx, %edi	# c
-	xorl	%ecx, %edi	# ^d
-	xorl	%eax, %edi	# ^b
-	addl	-64+4*12(%rsp), %edx	# e += RCONST + W[n & 15]
-	addl	%edi, %edx	# e += (c ^ d ^ b)
-	movl	%ebp, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %edx	# e += rotl32(a,5)
-	rorl	$2, %eax	# b = rotl32(b,30)
-# 77
-	movl	%eax, %edi	# c
-	xorl	%ebx, %edi	# ^d
-	xorl	%ebp, %edi	# ^b
-	addl	-64+4*13(%rsp), %ecx	# e += RCONST + W[n & 15]
-	addl	%edi, %ecx	# e += (c ^ d ^ b)
-	movl	%edx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ecx	# e += rotl32(a,5)
-	rorl	$2, %ebp	# b = rotl32(b,30)
-# 78
-	movl	%ebp, %edi	# c
-	xorl	%eax, %edi	# ^d
-	xorl	%edx, %edi	# ^b
-	addl	-64+4*14(%rsp), %ebx	# e += RCONST + W[n & 15]
-	addl	%edi, %ebx	# e += (c ^ d ^ b)
-	movl	%ecx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %ebx	# e += rotl32(a,5)
-	rorl	$2, %edx	# b = rotl32(b,30)
-# 79
-	movl	%edx, %edi	# c
-	xorl	%ebp, %edi	# ^d
-	xorl	%ecx, %edi	# ^b
-	addl	-64+4*15(%rsp), %eax	# e += RCONST + W[n & 15]
-	addl	%edi, %eax	# e += (c ^ d ^ b)
-	movl	%ebx, %esi	#
-	roll	$5, %esi	# rotl32(a,5)
-	addl	%esi, %eax	# e += rotl32(a,5)
-	rorl	$2, %ecx	# b = rotl32(b,30)
-
-	popq	%rdi	#
-	popq	%r12	#
-	addl	%eax, 80(%rdi)	# ctx->hash[0] += a
-	popq	%r13	#
-	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
-	popq	%r14	#
-	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
-#	popq	%r15	#
-	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
-	popq	%rbx	#
-	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e
-	popq	%rbp	#
-
-	ret
-	.size	sha1_process_block64, .-sha1_process_block64
-
-	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
-	.balign	16
-sha1const:
-	.long	0x5A827999
-	.long	0x6ED9EBA1
-	.long	0x8F1BBCDC
-	.long	0xCA62C1D6
-
-#endif
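For readers following the integer code in the removed file: every numbered block is one SHA-1 round, and the boolean function plus round constant change every 20 rounds (the four constants are exactly the sha1const table at the end). A minimal C sketch of the per-round update, written for this note and not taken from busybox (names are mine):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* s[0..4] = a,b,c,d,e; Wt is the already-scheduled message word for round t */
static void sha1_round_sketch(uint32_t s[5], unsigned t, uint32_t Wt)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
	uint32_t f, K;

	if (t < 20)      { f = ((c ^ d) & b) ^ d;       K = 0x5A827999; }  /* rounds 0..19  */
	else if (t < 40) { f = c ^ d ^ b;               K = 0x6ED9EBA1; }  /* rounds 20..39 */
	else if (t < 60) { f = ((b | c) & d) | (b & c); K = 0x8F1BBCDC; }  /* rounds 40..59 */
	else             { f = c ^ d ^ b;               K = 0xCA62C1D6; }  /* rounds 60..79 */

	e += rotl32(a, 5) + f + K + Wt;  /* the leal/addl chain of each round                */
	b = rotl32(b, 30);               /* the "rorl $2" (rotate right 2 == rotate left 30) */

	s[0] = e; s[1] = a; s[2] = b; s[3] = c; s[4] = d;  /* roles rotate for the next round */
}

The assembly never moves a..e between registers: it keeps them in %eax..%edx and %ebp and simply renames which register plays which role in the next round, which is why consecutive rounds look identical except for the register permutation.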
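The repeated PREP blocks compute the message schedule four words at a time: W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1). Lane 3 of a batch depends on lane 0 of the same batch (W[t+3] needs W[t]), so the block first computes all four lanes with that term left out, rotates by 1 with the pcmpgtd/paddd/psubd trick, then XORs rotl32 of the unrotated lane 0 by 2 into lane 3 as the fixup. A sketch of the same computation with SSE2 intrinsics, assuming lane i of the arguments holds W[t-16+i], W[t-12+i], W[t-8+i] and W[t-4+i]; the function name and layout are mine, not busybox's:

#include <emmintrin.h>  /* SSE2 */

static __m128i sha1_sched4_sketch(__m128i w16, __m128i w12, __m128i w8, __m128i w4)
{
	__m128i t1, t2, w, neg, fix;

	t1 = _mm_srli_si128(w4, 4);                      /* psrldq $4: (W[t-3],W[t-2],W[t-1],0)    */
	t2 = _mm_castps_si128(_mm_shuffle_ps(            /* shufps $0x4e: (W[t-14]..W[t-11])       */
		_mm_castsi128_ps(w16), _mm_castsi128_ps(w12), 0x4e));
	w = _mm_xor_si128(_mm_xor_si128(w16, w8), _mm_xor_si128(t1, t2));

	fix = w;                                         /* unrotated; lane 3 still lacks W[t]     */
	neg = _mm_cmpgt_epi32(_mm_setzero_si128(), w);   /* pcmpgtd: all-ones where the msb is set */
	w = _mm_add_epi32(w, w);                         /* paddd: shift left by 1 ...             */
	w = _mm_sub_epi32(w, neg);                       /* psubd: ... plus the carried-out msb,   */
	                                                 /* i.e. rotl32(lane, 1)                   */
	fix = _mm_slli_si128(fix, 12);                   /* pslldq $12: (0,0,0, unrotated W[t])    */
	fix = _mm_xor_si128(_mm_slli_epi32(fix, 2),      /* pslld $2 / psrld $30: rotate by 2,     */
	                    _mm_srli_epi32(fix, 30));    /* which equals the missing rotl32(W[t],1) */
	return _mm_xor_si128(w, fix);
}

The real code then adds the current round constant (paddd %xmm6) and spills the four ready values to the -64(%rsp) area, which is how they reach the integer rounds as "RCONST + W[n & 15]".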
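One more trick from the prologue: W[] is big-endian in the input block, and the code byteswaps two words at a time with a single 64-bit bswap plus a 32-bit rotate before feeding them to movq/punpcklqdq. A small C equivalent (hypothetical helper; it assumes the GCC builtins that the file's own __GNUC__ guard already requires):

#include <stdint.h>
#include <string.h>

static inline uint64_t load_two_w_be(const unsigned char *p)
{
	uint64_t v;
	memcpy(&v, p, 8);              /* movq 4*n(%rdi), %reg: W[n], W[n+1], still big-endian */
	v = __builtin_bswap64(v);      /* bswapq: register is now W[n]:W[n+1] (high:low)       */
	return (v << 32) | (v >> 32);  /* rolq $32: W[n+1]:W[n], so the low dword is W[n]      */
}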