From f079f913711091c87557a4c7854385c408e4ab7c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 20 Dec 2020 21:37:29 +0100 Subject: factor: 30% faster trial division (better sieve) function old new delta packed_wheel - 192 +192 factor_main 108 176 +68 factorize 284 218 -66 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 1/1 up/down: 260/-66) Total: 194 bytes Signed-off-by: Denys Vlasenko --- coreutils/factor.c | 170 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 95 insertions(+), 75 deletions(-) diff --git a/coreutils/factor.c b/coreutils/factor.c index 1f24784..30498c7 100644 --- a/coreutils/factor.c +++ b/coreutils/factor.c @@ -19,6 +19,7 @@ //usage: "Print prime factors" #include "libbb.h" +#include "common_bufsiz.h" #if 0 # define dbg(...) bb_error_msg(__VA_ARGS__) @@ -42,6 +43,83 @@ typedef unsigned long half_t; #error Cant find an integer type which is half as wide as ullong #endif +/* The trial divisor increment wheel. Use it to skip over divisors that + * are composites of 2, 3, 5, 7, or 11. + * Larger wheels improve sieving only slightly, but quickly grow in size + * (adding just one prime, 13, results in 5766 element sieve). + */ +#define R(a,b,c,d,e,f,g,h,i,j,A,B,C,D,E,F,G,H,I,J) \ + (((uint64_t)(a<<0) | (b<<3) | (c<<6) | (d<<9) | (e<<12) | (f<<15) | (g<<18) | (h<<21) | (i<<24) | (j<<27)) ) | \ + (((uint64_t)(A<<0) | (B<<3) | (C<<6) | (D<<9) | (E<<12) | (F<<15) | (G<<18) | (H<<21) | (I<<24) | (J<<27)) << 30) | \ + (((uint64_t)7 << 60)) +#define P(a,b,c,d,e,f,g,h,i,j,A,B,C,D,E,F,G,H,I,J) \ + R( (a/2-1),(b/2-1),(c/2-1),(d/2-1),(e/2-1),(f/2-1),(g/2-1),(h/2-1),(i/2-1),(j/2-1), \ + (A/2-1),(B/2-1),(C/2-1),(D/2-1),(E/2-1),(F/2-1),(G/2-1),(H/2-1),(I/2-1),(J/2-1) ) +static const uint64_t packed_wheel[] = { + /*1, 2, 2, 4, 2,*/ + P( 4, 2, 4, 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6, 4, 6, 8, 4), //01 + P( 2, 4, 2, 4,14, 4, 6, 2,10, 2, 6, 6, 4, 2, 4, 6, 2,10, 2, 4), //02 + P( 2,12,10, 2, 4, 2, 4, 6, 2, 6, 4, 6, 6, 6, 2, 6, 4, 2, 6, 4), //03 + P( 6, 8, 4, 2, 4, 6, 8, 6,10, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2), //04 + P( 6, 4, 2, 6,10, 2,10, 2, 4, 2, 4, 6, 8, 4, 2, 4,12, 2, 6, 4), //05 + P( 2, 6, 4, 6,12, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6,10, 2), //06 + P( 4, 6, 2, 6, 4, 2, 4, 2,10, 2,10, 2, 4, 6, 6, 2, 6, 6, 4, 6), //07 + P( 6, 2, 6, 4, 2, 6, 4, 6, 8, 4, 2, 6, 4, 8, 6, 4, 6, 2, 4, 6), //08 + P( 8, 6, 4, 2,10, 2, 6, 4, 2, 4, 2,10, 2,10, 2, 4, 2, 4, 8, 6), //09 + P( 4, 2, 4, 6, 6, 2, 6, 4, 8, 4, 6, 8, 4, 2, 4, 2, 4, 8, 6, 4), //10 + P( 6, 6, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4, 2,10, 2,10, 2), //11 + P( 6, 4, 6, 2, 6, 4, 2, 4, 6, 6, 8, 4, 2, 6,10, 8, 4, 2, 4, 2), //12 + P( 4, 8,10, 6, 2, 4, 8, 6, 6, 4, 2, 4, 6, 2, 6, 4, 6, 2,10, 2), //13 + P(10, 2, 4, 2, 4, 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 6, 6, 4, 6, 8), //14 + P( 4, 2, 4, 2, 4, 8, 6, 4, 8, 4, 6, 2, 6, 6, 4, 2, 4, 6, 8, 4), //15 + P( 2, 4, 2,10, 2,10, 2, 4, 2, 4, 6, 2,10, 2, 4, 6, 8, 6, 4, 2), //16 + P( 6, 4, 6, 8, 4, 6, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6, 6, 4, 6), //17 + P( 6, 2, 6, 6, 4, 2,10, 2,10, 2, 4, 2, 4, 6, 2, 6, 4, 2,10, 6), //18 + P( 2, 6, 4, 2, 6, 4, 6, 8, 4, 2, 4, 2,12, 6, 4, 6, 2, 4, 6, 2), //19 + P(12, 4, 2, 4, 8, 6, 4, 2, 4, 2,10, 2,10, 6, 2, 4, 6, 2, 6, 4), //20 + P( 2, 4, 6, 6, 2, 6, 4, 2,10, 6, 8, 6, 4, 2, 4, 8, 6, 4, 6, 2), //21 + P( 4, 6, 2, 6, 6, 6, 4, 6, 2, 6, 4, 2, 4, 2,10,12, 2, 4, 2,10), //22 + P( 2, 6, 4, 2, 4, 6, 6, 2,10, 2, 6, 4,14, 4, 2, 4, 2, 4, 8, 6), //23 + P( 4, 6, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4,12, 2,12), //24 +}; +#undef P +#undef r +#define WHEEL_START 5 +#define WHEEL_SIZE (5 + 24 * 20) +#define wheel_tab ((uint8_t*)&bb_common_bufsiz1) +/* + * Why, you ask? + * plain byte array: + * function old new delta + * wheel_tab - 485 +485 + * 3-bit-packed insanity: + * packed_wheel - 192 +192 + * factor_main 108 176 +68 + */ +static void unpack_wheel(void) +{ + int i; + uint8_t *p; + + setup_common_bufsiz(); + wheel_tab[0] = 1; + wheel_tab[1] = 2; + wheel_tab[2] = 2; + wheel_tab[3] = 4; + wheel_tab[4] = 2; + p = &wheel_tab[5]; + for (i = 0; i < ARRAY_SIZE(packed_wheel); i++) { + uint64_t v = packed_wheel[i]; + do { + *p = ((unsigned)(v & 7) + 1) * 2; + //printf("%2u,", *p); + p++; + v >>= 3; + } while ((unsigned)v != 7); + //printf("\n"); + } +} + static half_t isqrt_odd(wide_t N) { half_t s = isqrt(N); @@ -53,43 +131,20 @@ static half_t isqrt_odd(wide_t N) static NOINLINE void factorize(wide_t N) { + unsigned w; half_t factor; half_t max_factor; - // unsigned count3; - // unsigned count5; - // unsigned count7; - // ^^^^^^^^^^^^^^^ commented-out simple sieving code (easier to grasp). - // Faster sieving, using one word for potentially up to 6 counters: - // count upwards in each mask, counter "triggers" when it sets its mask to "100[0]..." - // 10987654321098765432109876543210 - bits 31-0 in 32-bit word - // 17777713333311111777775555333 - bit masks for counters for primes 3,5,7,11,13,17 - // 100000100001000010001001 - value for adding 1 to each mask - // 10000010000010000100001000100 - value for checking that any mask reached msb - enum { - SHIFT_3 = 1 << 0, - SHIFT_5 = 1 << 3, - SHIFT_7 = 1 << 7, - INCREMENT_EACH = SHIFT_3 | SHIFT_5 | SHIFT_7, - MULTIPLE_OF_3 = 1 << 2, - MULTIPLE_OF_5 = 1 << 6, - MULTIPLE_OF_7 = 1 << 11, - MULTIPLE_DETECTED = MULTIPLE_OF_3 | MULTIPLE_OF_5 | MULTIPLE_OF_7, - }; - unsigned sieve_word; if (N < 4) goto end; - while (!(N & 1)) { - printf(" 2"); - N >>= 1; - } - /* The code needs to be optimized for the case where * there are large prime factors. For example, * this is not hard: * 8262075252869367027 = 3 7 17 23 47 101 113 127 131 137 823 - * (the largest factor to test is only ~sqrt(823) = 28) + * (the largest divisor to test for largest factor 823 + * is only ~sqrt(823) = 28, the entire factorization needs + * only ~33 trial divisions) * but this is: * 18446744073709551601 = 53 348051774975651917 * the last factor requires testing up to @@ -98,20 +153,11 @@ static NOINLINE void factorize(wide_t N) * factor 18446744073709551557 (0xffffffffffffffc5). */ max_factor = isqrt_odd(N); - // count3 = 3; - // count5 = 6; - // count7 = 9; - sieve_word = 0 - /* initial count for SHIFT_n is (n-1)/2*3: */ - + (MULTIPLE_OF_3 - 3 * SHIFT_3) - + (MULTIPLE_OF_5 - 6 * SHIFT_5) - + (MULTIPLE_OF_7 - 9 * SHIFT_7) - //+ (MULTIPLE_OF_11 - 15 * SHIFT_11) - //+ (MULTIPLE_OF_13 - 18 * SHIFT_13) - //+ (MULTIPLE_OF_17 - 24 * SHIFT_17) - ; - factor = 3; + factor = 2; + w = 0; for (;;) { + half_t fw; + /* The division is the most costly part of the loop. * On 64bit CPUs, takes at best 12 cycles, often ~20. */ @@ -120,44 +166,16 @@ static NOINLINE void factorize(wide_t N) printf(" %"HALF_FMT"u", factor); max_factor = isqrt_odd(N); } - next_factor: if (factor >= max_factor) break; - factor += 2; - /* Rudimentary wheel sieving: skip multiples of 3, 5 and 7: - * Every third odd number is divisible by three and thus isn't a prime: - * 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47... - * ^ ^ ^ ^ ^ ^ ^ _ ^ ^ _ ^ ^ ^ ^ - * (^ = primes, _ = would-be-primes-if-not-divisible-by-5) - * The numbers with space under them are excluded by sieve 3. - */ - // count7--; - // count5--; - // count3--; - // if (count3 && count5 && count7) - // continue; - sieve_word += INCREMENT_EACH; - if (!(sieve_word & MULTIPLE_DETECTED)) + fw = factor + wheel_tab[w]; + if (fw < factor) + break; /* overflow */ + factor = fw; + w++; + if (w < WHEEL_SIZE) continue; - /* - * "factor" is multiple of 3 33% of the time (count3 reached 0), - * else, multiple of 5 13% of the time, - * else, multiple of 7 7.6% of the time. - * Cumulatively, with 3,5,7 sieving we are here 54.3% of the time. - */ - // if (count3 == 0) - // count3 = 3; - if (sieve_word & MULTIPLE_OF_3) - sieve_word -= SHIFT_3 * 3; - // if (count5 == 0) - // count5 = 5; - if (sieve_word & MULTIPLE_OF_5) - sieve_word -= SHIFT_5 * 5; - // if (count7 == 0) - // count7 = 7; - if (sieve_word & MULTIPLE_OF_7) - sieve_word -= SHIFT_7 * 7; - goto next_factor; + w = WHEEL_START; } end: if (N > 1) @@ -182,6 +200,8 @@ static void factorize_numstr(const char *numstr) int factor_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int factor_main(int argc UNUSED_PARAM, char **argv) { + unpack_wheel(); + //// coreutils has undocumented option ---debug (three dashes) //getopt32(argv, ""); //argv += optind; -- cgit v1.1