diff options
-rw-r--r-- | libbb/Kbuild.src | 1 |
-rw-r--r-- | libbb/hash_md5_sha.c | 21 |
-rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 231 |
3 files changed, 252 insertions, 1 deletion
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index a3db02b..e8bb24f 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -58,6 +58,7 @@ lib-y += makedev.o
 lib-y += hash_md5_sha.o
 lib-y += hash_md5_sha_x86-64.o
 lib-y += hash_md5_sha_x86-64_shaNI.o
+lib-y += hash_md5_sha_x86-32_shaNI.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 4c6904b..0b3af72 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -1143,6 +1143,25 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 #endif /* NEED_SHA512 */
 
 #if ENABLE_SHA1_HWACCEL
+# if defined(__GNUC__) && defined(__i386__)
+static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+	asm (
+		" cpuid\n"
+		: "=a"(*eax), /* Output */
+		  "=b"(*ebx),
+		  "=c"(*ecx),
+		  "=d"(*edx)
+		: "0"(*eax), /* Input */
+		  "1"(*ebx),
+		  "2"(*ecx),
+		  "3"(*edx)
+		/* No clobbered registers */
+	);
+}
+struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
+void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
+# endif
 # if defined(__GNUC__) && defined(__x86_64__)
 static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
 {
@@ -1174,7 +1193,7 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 	ctx->total64 = 0;
 	ctx->process_block = sha1_process_block64;
 #if ENABLE_SHA1_HWACCEL
-# if defined(__GNUC__) && defined(__x86_64__)
+# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	{
 		static smallint shaNI;
 		if (!shaNI) {
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
new file mode 100644
index 0000000..7202c76
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -0,0 +1,231 @@
+#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__)
+/* The code is adapted from Linux kernel's source */
+
+// We use shorter insns, even though they are for "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such penalty do not support SHA1 insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find measurable difference).
+
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define xor128 pxor
+#define xor128 xorps
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+
+#define extr128_32 pextrd
+//#define extr128_32 extractps # not shorter
+
+	.section	.text.sha1_process_block64_shaNI,"ax",@progbits
+	.globl	sha1_process_block64_shaNI
+	.hidden	sha1_process_block64_shaNI
+	.type	sha1_process_block64_shaNI, @function
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+
+	.balign	8	# allow decoders to fetch at least 2 first insns
+sha1_process_block64_shaNI:
+	pushl	%ebp
+	movl	%esp, %ebp
+	subl	$32, %esp
+	andl	$~0xF, %esp	# paddd needs aligned memory operand
+
+	/* load initial hash values */
+	xor128	E0, E0
+	movu128	76(%eax), ABCD
+	pinsrd	$3, 76+4*4(%eax), E0	# load to upper 32-bit word
+	shuf128_32	$0x1B, ABCD, ABCD	# 00011011: bswap
+
+	mova128	PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK
+
+	/* Save hash values for addition after rounds */
+	movu128	E0, 16(%esp)
+	movu128	ABCD, (%esp)
+
+	/* Rounds 0-3 */
+	movu128	0*16(%eax), MSG0
+	pshufb	SHUF_MASK, MSG0
+	paddd	MSG0, E0
+	mova128	ABCD, E1
+	sha1rnds4	$0, E0, ABCD
+
+	/* Rounds 4-7 */
+	movu128	1*16(%eax), MSG1
+	pshufb	SHUF_MASK, MSG1
+	sha1nexte	MSG1, E1
+	mova128	ABCD, E0
+	sha1rnds4	$0, E1, ABCD
+	sha1msg1	MSG1, MSG0
+
+	/* Rounds 8-11 */
+	movu128	2*16(%eax), MSG2
+	pshufb	SHUF_MASK, MSG2
+	sha1nexte	MSG2, E0
+	mova128	ABCD, E1
+	sha1rnds4	$0, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 12-15 */
+	movu128	3*16(%eax), MSG3
+	pshufb	SHUF_MASK, MSG3
+	sha1nexte	MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG3, MSG0
+	sha1rnds4	$0, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 16-19 */
+	sha1nexte	MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG0, MSG1
+	sha1rnds4	$0, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 20-23 */
+	sha1nexte	MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG1, MSG2
+	sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	xor128	MSG1, MSG3
+
+	/* Rounds 24-27 */
+	sha1nexte	MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG2, MSG3
+	sha1rnds4	$1, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 28-31 */
+	sha1nexte	MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG3, MSG0
+	sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 32-35 */
+	sha1nexte	MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG0, MSG1
+	sha1rnds4	$1, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 36-39 */
+	sha1nexte	MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG1, MSG2
+	sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	xor128	MSG1, MSG3
+
+	/* Rounds 40-43 */
+	sha1nexte	MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG2, MSG3
+	sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 44-47 */
+	sha1nexte	MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG3, MSG0
+	sha1rnds4	$2, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 48-51 */
+	sha1nexte	MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG0, MSG1
+	sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 52-55 */
+	sha1nexte	MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG1, MSG2
+	sha1rnds4	$2, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	xor128	MSG1, MSG3
+
+	/* Rounds 56-59 */
+	sha1nexte	MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG2, MSG3
+	sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	xor128	MSG2, MSG0
+
+	/* Rounds 60-63 */
+	sha1nexte	MSG3, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG3, MSG0
+	sha1rnds4	$3, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	xor128	MSG3, MSG1
+
+	/* Rounds 64-67 */
+	sha1nexte	MSG0, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG0, MSG1
+	sha1rnds4	$3, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	xor128	MSG0, MSG2
+
+	/* Rounds 68-71 */
+	sha1nexte	MSG1, E1
+	mova128	ABCD, E0
+	sha1msg2	MSG1, MSG2
+	sha1rnds4	$3, E1, ABCD
+	xor128	MSG1, MSG3
+
+	/* Rounds 72-75 */
+	sha1nexte	MSG2, E0
+	mova128	ABCD, E1
+	sha1msg2	MSG2, MSG3
+	sha1rnds4	$3, E0, ABCD
+
+	/* Rounds 76-79 */
+	sha1nexte	MSG3, E1
+	mova128	ABCD, E0
+	sha1rnds4	$3, E1, ABCD
+
+	/* Add current hash values with previously saved */
+	sha1nexte	16(%esp), E0
+	paddd	(%esp), ABCD
+
+	/* Write hash values back in the correct order */
+	shuf128_32	$0x1B, ABCD, ABCD
+	movu128	ABCD, 76(%eax)
+	extr128_32	$3, E0, 76+4*4(%eax)
+
+	movl	%ebp, %esp
+	popl	%ebp
+	ret
+	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x000102030405060708090a0b0c0d0e0f
+
+#endif