6 files changed, 163 insertions, 115 deletions
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
index 4b33449..c059fb1 100644
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -15,6 +15,10 @@
 //#define shuf128_32 pshufd
 #define shuf128_32 shufps
 
+// pshufb and palignr are SSSE3 insns.
+// We do not check SSSE3 in cpuid,
+// all SHA-capable CPUs support it as well.
+
 	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
 	.globl	sha256_process_block64_shaNI
 	.hidden	sha256_process_block64_shaNI
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
index 5ed80c2..9578441 100644
--- a/libbb/hash_md5_sha256_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -15,6 +15,10 @@
 //#define shuf128_32 pshufd
 #define shuf128_32 shufps
 
+// pshufb and palignr are SSSE3 insns.
+// We do not check SSSE3 in cpuid,
+// all SHA-capable CPUs support it as well.
+
 	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
 	.globl	sha256_process_block64_shaNI
 	.hidden	sha256_process_block64_shaNI
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index c7fb243..2366b04 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -20,6 +20,11 @@
 #define extr128_32 pextrd
 //#define extr128_32 extractps	# not shorter
 
+// pshufb is a SSSE3 insn.
+// pinsrd, pextrd, extractps are SSE4.1 insns.
+// We do not check SSSE3/SSE4.1 in cpuid,
+// all SHA-capable CPUs support them as well.
+
 	.section	.text.sha1_process_block64_shaNI, "ax", @progbits
 	.globl	sha1_process_block64_shaNI
 	.hidden	sha1_process_block64_shaNI
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 51fde08..f0daa30 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -27,60 +27,68 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
 	movaps	sha1const(%rip), %xmm7
-	movaps	bswap32_mask(%rip), %xmm4
 	pshufd	$0x00, %xmm7, %xmm6
 
 	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+	# For iterations 0..15, we pass W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
+	# We lose parallelized addition of RCONST, but LEA
+	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# ADDs in two first RD1As shorter by one byte).
-	movups	16*0(%rdi), %xmm0
-	pshufb	%xmm4, %xmm0
-	movaps	%xmm0, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %rsi
-#	pextrq	$1, %xmm5, %r8	#SSE4.1 insn
-#	movhpd	%xmm5, %r8		#can only move to mem, not to reg
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r8
-
-	movups	16*1(%rdi), %xmm1
-	pshufb	%xmm4, %xmm1
-	movaps	%xmm1, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r9
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r10
+	# LEAs in two first RD1As shorter by one byte).
+	movq	4*0(%rdi), %rsi
+	movq	4*2(%rdi), %r8
+	bswapq	%rsi
+	bswapq	%r8
+	rolq	$32, %rsi		# rsi = W[1]:W[0]
+	rolq	$32, %r8		# r8  = W[3]:W[2]
+	movq	%rsi, %xmm0
+	movq	%r8, %xmm4
+	punpcklqdq %xmm4, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, %xmm4		# add RCONST, spill to stack
+#	paddd	%xmm6, %xmm4
+#	movups	%xmm4, -64+16*0(%rsp)
 
-	movups	16*2(%rdi), %xmm2
-	pshufb	%xmm4, %xmm2
-	movaps	%xmm2, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r11
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r12
+	movq	4*4(%rdi), %r9
+	movq	4*6(%rdi), %r10
+	bswapq	%r9
+	bswapq	%r10
+	rolq	$32, %r9		# r9  = W[5]:W[4]
+	rolq	$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
+	movq	%r10, %xmm4
+	punpcklqdq %xmm4, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
 
-	movups	16*3(%rdi), %xmm3
-	pshufb	%xmm4, %xmm3
-	movaps	%xmm3, %xmm5
-	paddd	%xmm6, %xmm5
-	movq	%xmm5, %r13
-	shufps	$0x0e, %xmm5, %xmm5
-	movq	%xmm5, %r14
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	$32, %r11		# r11  = W[9]:W[8]
+	rolq	$32, %r12		# r12  = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, %xmm4
+	punpcklqdq %xmm4, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
 
-	# MOVQs to GPRs (above) have somewhat high latency.
-	# Load hash[] while they are completing:
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+	movq	4*12(%rdi), %r13
+	movq	4*14(%rdi), %r14
+	bswapq	%r13
+	bswapq	%r14
+	rolq	$32, %r13		# r13  = W[13]:W[12]
+	rolq	$32, %r14		# r14  = W[15]:W[14]
+	movq	%r13, %xmm3
+	movq	%r14, %xmm4
+	punpcklqdq %xmm4, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 
 # 0
-	addl	%esi, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
 	shrq	$32, %rsi
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -92,7 +100,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 1
-	addl	%esi, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -103,7 +111,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 2
-	addl	%r8d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
 	shrq	$32, %r8
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -115,7 +123,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 3
-	addl	%r8d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -126,7 +134,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 4
-	addl	%r9d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
 	shrq	$32, %r9
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -138,7 +146,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 5
-	addl	%r9d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -149,7 +157,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 6
-	addl	%r10d, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
 	shrq	$32, %r10
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
@@ -161,7 +169,7 @@ sha1_process_block64:
 	addl	%edi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 7
-	addl	%r10d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
@@ -202,7 +210,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*0(%rsp)
 # 8
-	addl	%r11d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
 	shrq	$32, %r11
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
@@ -214,7 +222,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 9
-	addl	%r11d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
@@ -225,7 +233,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 10
-	addl	%r12d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
 	shrq	$32, %r12
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
@@ -237,7 +245,7 @@ sha1_process_block64:
 	addl	%edi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 11
-	addl	%r12d, %edx		# e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
@@ -279,7 +287,7 @@ sha1_process_block64:
 	paddd	%xmm6, %xmm5
 	movups	%xmm5, -64+16*1(%rsp)
 # 12
-	addl	%r13d, %ecx		# e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
 	shrq	$32, %r13
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
@@ -291,7 +299,7 @@ sha1_process_block64:
 	addl	%edi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 13
-	addl	%r13d, %ebx		# e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
@@ -302,7 +310,7 @@ sha1_process_block64:
 	addl	%edi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 14
-	addl	%r14d, %eax		# e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
 	shrq	$32, %r14
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
@@ -314,7 +322,7 @@ sha1_process_block64:
 	addl	%edi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 15
-	addl	%r14d, %ebp		# e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
@@ -1467,11 +1475,6 @@ sha1_process_block64:
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
-	.section	.rodata.cst16.bswap32_mask, "aM", @progbits, 16
-	.balign	16
-bswap32_mask:
-	.octa	0x0c0d0e0f08090a0b0405060700010203
-
 	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
 	.balign	16
 sha1const:
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index f34e6e6..57e77b1 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -99,6 +99,30 @@ INTERLEAVE() {
 	)
 }
 
+#	movaps  bswap32_mask(%rip), $xmmT1
+# Load W[] to xmm0..3, byteswapping on the fly.
+# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+# for use in RD1As instead of spilling them to stack.
+# (We use rsi instead of rN because this makes two
+# ADDs in two first RD1As shorter by one byte).
+#	movups	16*0(%rdi), %xmm0
+#	pshufb	$xmmT1, %xmm0		#SSSE3 insn
+#	movaps	%xmm0, $xmmT2
+#	paddd	$xmmRCONST, $xmmT2
+#	movq	$xmmT2, %rsi
+#	#pextrq	\$1, $xmmT2, %r8        #SSE4.1 insn
+#	#movhpd	$xmmT2, %r8             #can only move to mem, not to reg
+#	shufps	\$0x0e, $xmmT2, $xmmT2	# have to use two-insn sequence
+#	movq	$xmmT2, %r8		# instead
+#	...
+#	<repeat for xmm1,2,3>
+#	...
+#-	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
+#+	addl	%esi, %e$e			# e += RCONST + W[n]
+# ^^^^^^^^^^^^^^^^^^^^^^^^
+# The above is -97 bytes of code...
+# ...but pshufb is a SSSE3 insn. Can't use it.
+
 echo \
 "### Generated by hash_md5_sha_x86-64.S.sh ###
 
@@ -129,57 +153,65 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
 	movaps	sha1const(%rip), $xmmALLRCONST
-	movaps	bswap32_mask(%rip), $xmmT1
 	pshufd	\$0x00, $xmmALLRCONST, $xmmRCONST
 
 	# Load W[] to xmm0..3, byteswapping on the fly.
 	#
-	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+	# For iterations 0..15, we pass W[] in rsi,r8..r14
 	# for use in RD1As instead of spilling them to stack.
+	# We lose parallelized addition of RCONST, but LEA
+	# can do two additions at once, so it is probably a wash.
 	# (We use rsi instead of rN because this makes two
-	# ADDs in two first RD1As shorter by one byte).
-	movups	16*0(%rdi), %xmm0
-	pshufb	$xmmT1, %xmm0
-	movaps	%xmm0, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %rsi
-#	pextrq	\$1, $xmmT2, %r8	#SSE4.1 insn
-#	movhpd	$xmmT2, %r8		#can only move to mem, not to reg
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r8
-
-	movups	16*1(%rdi), %xmm1
-	pshufb	$xmmT1, %xmm1
-	movaps	%xmm1, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r9
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r10
-
-	movups	16*2(%rdi), %xmm2
-	pshufb	$xmmT1, %xmm2
-	movaps	%xmm2, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r11
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r12
-
-	movups	16*3(%rdi), %xmm3
-	pshufb	$xmmT1, %xmm3
-	movaps	%xmm3, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r13
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r14
-
-	# MOVQs to GPRs (above) have somewhat high latency.
-	# Load hash[] while they are completing:
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+	# LEAs in two first RD1As shorter by one byte).
+	movq	4*0(%rdi), %rsi
+	movq	4*2(%rdi), %r8
+	bswapq	%rsi
+	bswapq	%r8
+	rolq	\$32, %rsi		# rsi = W[1]:W[0]
+	rolq	\$32, %r8		# r8  = W[3]:W[2]
+	movq	%rsi, %xmm0
+	movq	%r8, $xmmT1
+	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
+#	paddd	$xmmRCONST, $xmmT1
+#	movups	$xmmT1, -64+16*0(%rsp)
+
+	movq	4*4(%rdi), %r9
+	movq	4*6(%rdi), %r10
+	bswapq	%r9
+	bswapq	%r10
+	rolq	\$32, %r9		# r9  = W[5]:W[4]
+	rolq	\$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
+	movq	%r10, $xmmT1
+	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	\$32, %r11		# r11  = W[9]:W[8]
+	rolq	\$32, %r12		# r12  = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, $xmmT1
+	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+
+	movq	4*12(%rdi), %r13
+	movq	4*14(%rdi), %r14
+	bswapq	%r13
+	bswapq	%r14
+	rolq	\$32, %r13		# r13  = W[13]:W[12]
+	rolq	\$32, %r14		# r14  = W[15]:W[14]
+	movq	%r13, %xmm3
+	movq	%r14, $xmmT1
+	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 "
 
 PREP() {
@@ -258,15 +290,15 @@ local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
-	addl	%esi, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
 	shrq	\$32, %rsi
 ";test $n0 = 1 && echo "
-	addl	%esi, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
 ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
-	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
 	shrq	\$32, %r$rN
 ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
-	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
@@ -432,11 +464,6 @@ echo "
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
-	.section	.rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
-	.balign	16
-bswap32_mask:
-	.octa	0x0c0d0e0f08090a0b0405060700010203
-
 	.section	.rodata.cst16.sha1const, \"aM\", @progbits, 16
 	.balign	16
 sha1const:
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index c13cdec..794e970 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -20,6 +20,11 @@
 #define extr128_32 pextrd
 //#define extr128_32 extractps	# not shorter
 
+// pshufb is a SSSE3 insn.
+// pinsrd, pextrd, extractps are SSE4.1 insns.
+// We do not check SSSE3/SSE4.1 in cpuid,
+// all SHA-capable CPUs support them as well.
+
 	.section	.text.sha1_process_block64_shaNI, "ax", @progbits
 	.globl	sha1_process_block64_shaNI
 	.hidden	sha1_process_block64_shaNI