author      Denys Vlasenko    2021-12-30 13:07:12 +0100
committer   Denys Vlasenko    2021-12-30 13:17:16 +0100
commit      25aadc893d21b35f7d34a9d1edc843632e7abd8f (patch)
tree        f8d2d860f06eeccb254e071e39213bc2a3f723ef
parent      9173c9cce48dc4c867fb06bb72e8c762740c5c86 (diff)
libbb/sha1: add config-selectable fully unrolled version, closes 14391

function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)         Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--   libbb/Config.src       25
-rw-r--r--   libbb/hash_md5_sha.c   84
2 files changed, 95 insertions, 14 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index 24b31fa..13188ef 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -42,21 +42,32 @@ config MD5_SMALL
default 1 # all "fast or small" options default to small
range 0 3
help
- Trade binary size versus speed for the md5sum algorithm.
+ Trade binary size versus speed for the md5 algorithm.
Approximate values running uClibc and hashing
linux-2.4.4.tar.bz2 were:
- value user times (sec) text size (386)
- 0 (fastest) 1.1 6144
- 1 1.4 5392
- 2 3.0 5088
- 3 (smallest) 5.1 4912
+	value          user times (sec)   text size (386)
+	0 (fastest)          1.1              6144
+	1                    1.4              5392
+	2                    3.0              5088
+	3 (smallest)         5.1              4912
+
+config SHA1_SMALL
+ int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
+ default 3 # all "fast or small" options default to small
+ range 0 3
+ help
+ Trade binary size versus speed for the sha1 algorithm.
+	        throughput MB/s        size of sha1_process_block64
+	value   486     x86-64         486     x86-64
+	0       339     374            4149    4167
+	1,2,3   200     195            358     380
config SHA3_SMALL
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
default 1 # all "fast or small" options default to small
range 0 1
help
- Trade binary size versus speed for the sha3sum algorithm.
+ Trade binary size versus speed for the sha3 algorithm.
SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
64-bit x86: +270 bytes of code, 45% faster
32-bit x86: +450 bytes of code, 75% faster
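
The MB/s figures in these help texts come from simple throughput runs. A minimal stand-alone
sketch of such a measurement is shown below; it is not part of the commit. It assumes libbb's
sha1_begin()/sha1_hash()/sha1_end() interface from include/libbb.h (sha1_hash may be a macro
alias of md5_hash there), and the buffer size, repetition count and timer are illustrative
choices, not the setup used to produce the table.

/* Hypothetical throughput micro-benchmark; build inside the busybox tree
 * (or adjust the include path) and link against libbb. */
#include "libbb.h"   /* assumed: sha1_ctx_t, sha1_begin(), sha1_hash(), sha1_end() */
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	static unsigned char buf[64 * 1024];   /* 64 KiB of arbitrary data */
	unsigned char digest[20];
	sha1_ctx_t ctx;
	struct timespec t0, t1;
	double sec, mb;
	unsigned i, loops = 4096;              /* 4096 * 64 KiB = 256 MiB hashed in total */

	memset(buf, 0xa5, sizeof(buf));
	clock_gettime(CLOCK_MONOTONIC, &t0);
	sha1_begin(&ctx);
	for (i = 0; i < loops; i++)
		sha1_hash(&ctx, buf, sizeof(buf));
	sha1_end(&ctx, digest);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	mb = (double)loops * sizeof(buf) / (1024.0 * 1024.0);
	printf("%.1f MB in %.3f s = %.1f MB/s\n", mb, sec, mb / sec);
	return 0;
}

Rebuilding with different CONFIG_SHA1_SMALL values and re-running the same binary-input
workload is what makes the per-value comparison meaningful.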
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a468397..75673e3 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
-# undef OP
# endif
/* Add checksum to the starting values */
ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
ctx->hash[3] += D;
#endif
}
+#undef OP
#undef FF
#undef FG
#undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
* then rebuild and compare "shaNNNsum bigfile" results.
*/
+#if CONFIG_SHA1_SMALL == 0
+/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
+ * It seems further speedup can be achieved by handling more than
+ * 64 bytes per one function call (coreutils does that).
+ */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+ static const uint32_t rconsts[] ALIGN4 = {
+ 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+ };
+ uint32_t W[16];
+ uint32_t a, b, c, d, e;
+
+ a = ctx->hash[0];
+ b = ctx->hash[1];
+ c = ctx->hash[2];
+ d = ctx->hash[3];
+ e = ctx->hash[4];
+
+#undef OP
+#define OP(A,B,C,D,E, n) \
+ do { \
+ uint32_t work = EXPR(B, C, D); \
+ if (n <= 15) \
+ work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
+ if (n >= 16) \
+ work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
+ E += work + rotl32(A, 5) + rconsts[n / 20]; \
+ B = rotl32(B, 30); \
+ } while (0)
+#define OP20(n) \
+ OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
+ OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
+ OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
+ OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
+
+ /* 4 rounds of 20 operations each */
+#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
+ OP20(0);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+ OP20(20);
+#undef EXPR
+#define EXPR(b,c,d) (((b | c) & d) | (b & c))
+ OP20(40);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+ OP20(60);
+
+#undef EXPR
+#undef OP
+#undef OP20
+
+ ctx->hash[0] += a;
+ ctx->hash[1] += b;
+ ctx->hash[2] += c;
+ ctx->hash[3] += d;
+ ctx->hash[4] += e;
+}
+#else
+/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
+
+/* Compact version, almost twice as slow as fully unrolled */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
static const uint32_t rconsts[] ALIGN4 = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
int i, j;
- int cnt;
+ int n;
uint32_t W[16+16];
uint32_t a, b, c, d, e;
/* On-stack work buffer frees up one register in the main loop
- * which otherwise will be needed to hold ctx pointer */
+ * which otherwise will be needed to hold ctx pointer.
+ *
+ * The compiler is not smart enough to realize it, though. :(
+ * If __attribute__((optimize("2"))) is added to the function,
+ * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
+ * register (making code 6 bytes smaller, not just faster).
+ */
for (i = 0; i < 16; i++)
W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
e = ctx->hash[4];
/* 4 rounds of 20 operations each */
- cnt = 0;
+ n = 0;
for (i = 0; i < 4; i++) {
j = 19;
do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
else /* i = 1 or 3 */
work ^= b;
ge16:
- W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
+ W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
}
- work += W[cnt];
+ work += W[n];
work += e + rotl32(a, 5) + rconsts[i];
/* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
c = rotl32(b, 30);
b = a;
a = work;
- cnt = (cnt + 1) & 15;
+ n = (n + 1) & 15;
} while (--j >= 0);
}
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
ctx->hash[3] += d;
ctx->hash[4] += e;
}
+#endif
/* Constants for SHA512 from FIPS 180-2:4.2.3.
* SHA256 constants from FIPS 180-2:4.2.2
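
Both the unrolled and the compact variants above rely on the same two ideas: a rolling
16-word message schedule addressed as W[n & 0xf], and selecting the round constant as
rconsts[n / 20]. The following stand-alone sketch is not busybox code; it uses a plain
loop instead of the OP/OP20 macros, but shows the same structure in isolation and checks
itself against the single-block test vector SHA1("abc") = a9993e364706816aba3e25717850c26c9cd0d89d.

/* Stand-alone illustration of the rolling W[] window and rconsts[n/20] indexing. */
#include <stdio.h>
#include <stdint.h>

static uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

int main(void)
{
	static const uint32_t rconsts[4] = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	/* "abc" padded to one 64-byte block: 0x80 terminator, zeros,
	 * then the 64-bit big-endian bit count (24) in the last byte. */
	uint8_t block[64] = { 'a', 'b', 'c', 0x80 };
	uint32_t h[5] = {
		0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
	};
	uint32_t W[16];
	uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
	int n;

	block[63] = 24;	/* message length in bits */

	for (n = 0; n < 80; n++) {
		uint32_t work;

		/* Round function: Ch, parity, Maj, parity - the same EXPRs
		 * the unrolled version selects once per 20-round group. */
		if (n < 20)
			work = ((c ^ d) & b) ^ d;
		else if (n < 40 || n >= 60)
			work = b ^ c ^ d;
		else
			work = ((b | c) & d) | (b & c);

		/* Message schedule: the first 16 words come from the block,
		 * later words recompute the same 16-slot window in place. */
		if (n <= 15)
			work += W[n & 0xf] = ((uint32_t)block[4*n] << 24)
				| ((uint32_t)block[4*n + 1] << 16)
				| ((uint32_t)block[4*n + 2] << 8)
				| block[4*n + 3];
		else
			work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf]
						^ W[(n+2) & 0xf] ^ W[n & 0xf], 1);

		work += e + rotl32(a, 5) + rconsts[n / 20];
		e = d;
		d = c;
		c = rotl32(b, 30);
		b = a;
		a = work;
	}
	h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;

	printf("%08x%08x%08x%08x%08x\n", h[0], h[1], h[2], h[3], h[4]);
	/* Expected output: a9993e364706816aba3e25717850c26c9cd0d89d */
	return 0;
}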