diff options
Diffstat (limited to 'archival/bz/blocksort.c')
-rw-r--r-- | archival/bz/blocksort.c | 1128 |
1 files changed, 1128 insertions, 0 deletions
diff --git a/archival/bz/blocksort.c b/archival/bz/blocksort.c new file mode 100644 index 0000000..7d2856b --- /dev/null +++ b/archival/bz/blocksort.c @@ -0,0 +1,1128 @@ +/* + * bzip2 is written by Julian Seward <jseward@bzip.org>. + * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>. + * See README and LICENSE files in this directory for more information. + */ + +/*-------------------------------------------------------------*/ +/*--- Block sorting machinery ---*/ +/*--- blocksort.c ---*/ +/*-------------------------------------------------------------*/ + +/* ------------------------------------------------------------------ +This file is part of bzip2/libbzip2, a program and library for +lossless, block-sorting data compression. + +bzip2/libbzip2 version 1.0.4 of 20 December 2006 +Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org> + +Please read the WARNING, DISCLAIMER and PATENTS sections in the +README file. + +This program is released under the terms of the license contained +in the file LICENSE. +------------------------------------------------------------------ */ + +/* #include "bzlib_private.h" */ + +/*---------------------------------------------*/ +/*--- Fallback O(N log(N)^2) sorting ---*/ +/*--- algorithm, for repetitive blocks ---*/ +/*---------------------------------------------*/ + +/*---------------------------------------------*/ +static +inline +void fallbackSimpleSort(uint32_t* fmap, + uint32_t* eclass, + int32_t lo, + int32_t hi) +{ + int32_t i, j, tmp; + uint32_t ec_tmp; + + if (lo == hi) return; + + if (hi - lo > 3) { + for (i = hi-4; i >= lo; i--) { + tmp = fmap[i]; + ec_tmp = eclass[tmp]; + for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4) + fmap[j-4] = fmap[j]; + fmap[j-4] = tmp; + } + } + + for (i = hi-1; i >= lo; i--) { + tmp = fmap[i]; + ec_tmp = eclass[tmp]; + for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) + fmap[j-1] = fmap[j]; + fmap[j-1] = tmp; + } +} + + +/*---------------------------------------------*/ +#define fswap(zz1, zz2) \ +{ \ + int32_t zztmp = zz1; \ + zz1 = zz2; \ + zz2 = zztmp; \ +} + +#define fvswap(zzp1, zzp2, zzn) \ +{ \ + int32_t yyp1 = (zzp1); \ + int32_t yyp2 = (zzp2); \ + int32_t yyn = (zzn); \ + while (yyn > 0) { \ + fswap(fmap[yyp1], fmap[yyp2]); \ + yyp1++; \ + yyp2++; \ + yyn--; \ + } \ +} + + +#define fmin(a,b) ((a) < (b)) ? (a) : (b) + +#define fpush(lz,hz) { \ + stackLo[sp] = lz; \ + stackHi[sp] = hz; \ + sp++; \ +} + +#define fpop(lz,hz) { \ + sp--; \ + lz = stackLo[sp]; \ + hz = stackHi[sp]; \ +} + +#define FALLBACK_QSORT_SMALL_THRESH 10 +#define FALLBACK_QSORT_STACK_SIZE 100 + + +static +void fallbackQSort3(uint32_t* fmap, + uint32_t* eclass, + int32_t loSt, + int32_t hiSt) +{ + int32_t unLo, unHi, ltLo, gtHi, n, m; + int32_t sp, lo, hi; + uint32_t med, r, r3; + int32_t stackLo[FALLBACK_QSORT_STACK_SIZE]; + int32_t stackHi[FALLBACK_QSORT_STACK_SIZE]; + + r = 0; + + sp = 0; + fpush(loSt, hiSt); + + while (sp > 0) { + AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004); + + fpop(lo, hi); + if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { + fallbackSimpleSort(fmap, eclass, lo, hi); + continue; + } + + /* Random partitioning. Median of 3 sometimes fails to + * avoid bad cases. Median of 9 seems to help but + * looks rather expensive. This too seems to work but + * is cheaper. Guidance for the magic constants + * 7621 and 32768 is taken from Sedgewick's algorithms + * book, chapter 35. + */ + r = ((r * 7621) + 1) % 32768; + r3 = r % 3; + if (r3 == 0) + med = eclass[fmap[lo]]; + else if (r3 == 1) + med = eclass[fmap[(lo+hi)>>1]]; + else + med = eclass[fmap[hi]]; + + unLo = ltLo = lo; + unHi = gtHi = hi; + + while (1) { + while (1) { + if (unLo > unHi) break; + n = (int32_t)eclass[fmap[unLo]] - (int32_t)med; + if (n == 0) { + fswap(fmap[unLo], fmap[ltLo]); + ltLo++; + unLo++; + continue; + }; + if (n > 0) break; + unLo++; + } + while (1) { + if (unLo > unHi) break; + n = (int32_t)eclass[fmap[unHi]] - (int32_t)med; + if (n == 0) { + fswap(fmap[unHi], fmap[gtHi]); + gtHi--; unHi--; + continue; + }; + if (n < 0) break; + unHi--; + } + if (unLo > unHi) break; + fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; + } + + AssertD(unHi == unLo-1, "fallbackQSort3(2)"); + + if (gtHi < ltLo) continue; + + n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); + m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); + + n = lo + unLo - ltLo - 1; + m = hi - (gtHi - unHi) + 1; + + if (n - lo > hi - m) { + fpush(lo, n); + fpush(m, hi); + } else { + fpush(m, hi); + fpush(lo, n); + } + } +} + +#undef fmin +#undef fpush +#undef fpop +#undef fswap +#undef fvswap +#undef FALLBACK_QSORT_SMALL_THRESH +#undef FALLBACK_QSORT_STACK_SIZE + + +/*---------------------------------------------*/ +/* Pre: + * nblock > 0 + * eclass exists for [0 .. nblock-1] + * ((UChar*)eclass) [0 .. nblock-1] holds block + * ptr exists for [0 .. nblock-1] + * + * Post: + * ((UChar*)eclass) [0 .. nblock-1] holds block + * All other areas of eclass destroyed + * fmap [0 .. nblock-1] holds sorted order + * bhtab[0 .. 2+(nblock/32)] destroyed +*/ + +#define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) +#define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) +#define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) +#define WORD_BH(zz) bhtab[(zz) >> 5] +#define UNALIGNED_BH(zz) ((zz) & 0x01f) + +static +void fallbackSort(uint32_t* fmap, + uint32_t* eclass, + uint32_t* bhtab, + int32_t nblock) +{ + int32_t ftab[257]; + int32_t ftabCopy[256]; + int32_t H, i, j, k, l, r, cc, cc1; + int32_t nNotDone; + int32_t nBhtab; + UChar* eclass8 = (UChar*)eclass; + + /* + * Initial 1-char radix sort to generate + * initial fmap and initial BH bits. + */ + for (i = 0; i < 257; i++) ftab[i] = 0; + for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; + for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; + for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; + + for (i = 0; i < nblock; i++) { + j = eclass8[i]; + k = ftab[j] - 1; + ftab[j] = k; + fmap[k] = i; + } + + nBhtab = 2 + (nblock / 32); + for (i = 0; i < nBhtab; i++) bhtab[i] = 0; + for (i = 0; i < 256; i++) SET_BH(ftab[i]); + + /* + * Inductively refine the buckets. Kind-of an + * "exponential radix sort" (!), inspired by the + * Manber-Myers suffix array construction algorithm. + */ + + /*-- set sentinel bits for block-end detection --*/ + for (i = 0; i < 32; i++) { + SET_BH(nblock + 2*i); + CLEAR_BH(nblock + 2*i + 1); + } + + /*-- the log(N) loop --*/ + H = 1; + while (1) { + j = 0; + for (i = 0; i < nblock; i++) { + if (ISSET_BH(i)) + j = i; + k = fmap[i] - H; + if (k < 0) + k += nblock; + eclass[k] = j; + } + + nNotDone = 0; + r = -1; + while (1) { + + /*-- find the next non-singleton bucket --*/ + k = r + 1; + while (ISSET_BH(k) && UNALIGNED_BH(k)) + k++; + if (ISSET_BH(k)) { + while (WORD_BH(k) == 0xffffffff) k += 32; + while (ISSET_BH(k)) k++; + } + l = k - 1; + if (l >= nblock) + break; + while (!ISSET_BH(k) && UNALIGNED_BH(k)) + k++; + if (!ISSET_BH(k)) { + while (WORD_BH(k) == 0x00000000) k += 32; + while (!ISSET_BH(k)) k++; + } + r = k - 1; + if (r >= nblock) + break; + + /*-- now [l, r] bracket current bucket --*/ + if (r > l) { + nNotDone += (r - l + 1); + fallbackQSort3(fmap, eclass, l, r); + + /*-- scan bucket and generate header bits-- */ + cc = -1; + for (i = l; i <= r; i++) { + cc1 = eclass[fmap[i]]; + if (cc != cc1) { + SET_BH(i); + cc = cc1; + }; + } + } + } + + H *= 2; + if (H > nblock || nNotDone == 0) + break; + } + + /* + * Reconstruct the original block in + * eclass8 [0 .. nblock-1], since the + * previous phase destroyed it. + */ + j = 0; + for (i = 0; i < nblock; i++) { + while (ftabCopy[j] == 0) + j++; + ftabCopy[j]--; + eclass8[fmap[i]] = (UChar)j; + } + AssertH(j < 256, 1005); +} + +#undef SET_BH +#undef CLEAR_BH +#undef ISSET_BH +#undef WORD_BH +#undef UNALIGNED_BH + + +/*---------------------------------------------*/ +/*--- The main, O(N^2 log(N)) sorting ---*/ +/*--- algorithm. Faster for "normal" ---*/ +/*--- non-repetitive blocks. ---*/ +/*---------------------------------------------*/ + +/*---------------------------------------------*/ +static +inline +Bool mainGtU( + uint32_t i1, + uint32_t i2, + UChar* block, + uint16_t* quadrant, + uint32_t nblock, + int32_t* budget) +{ + int32_t k; + UChar c1, c2; + uint16_t s1, s2; + +///unrolling + AssertD(i1 != i2, "mainGtU"); + /* 1 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 2 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 3 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 4 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 5 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 6 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 7 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 8 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 9 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 10 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 11 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + /* 12 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + i1++; i2++; + + k = nblock + 8; + +///unrolling + do { + /* 1 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 2 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 3 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 4 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 5 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 6 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 7 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + /* 8 */ + c1 = block[i1]; c2 = block[i2]; + if (c1 != c2) return (c1 > c2); + s1 = quadrant[i1]; s2 = quadrant[i2]; + if (s1 != s2) return (s1 > s2); + i1++; i2++; + + if (i1 >= nblock) i1 -= nblock; + if (i2 >= nblock) i2 -= nblock; + + k -= 8; + (*budget)--; + } while (k >= 0); + + return False; +} + + +/*---------------------------------------------*/ +/* + * Knuth's increments seem to work better + * than Incerpi-Sedgewick here. Possibly + * because the number of elems to sort is + * usually small, typically <= 20. + */ +static +const int32_t incs[14] = { + 1, 4, 13, 40, 121, 364, 1093, 3280, + 9841, 29524, 88573, 265720, + 797161, 2391484 +}; + +static +void mainSimpleSort(uint32_t* ptr, + UChar* block, + uint16_t* quadrant, + int32_t nblock, + int32_t lo, + int32_t hi, + int32_t d, + int32_t* budget) +{ + int32_t i, j, h, bigN, hp; + uint32_t v; + + bigN = hi - lo + 1; + if (bigN < 2) return; + + hp = 0; + while (incs[hp] < bigN) hp++; + hp--; + + for (; hp >= 0; hp--) { + h = incs[hp]; + + i = lo + h; + while (1) { + +///unrolling + /*-- copy 1 --*/ + if (i > hi) break; + v = ptr[i]; + j = i; + while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { + ptr[j] = ptr[j-h]; + j = j - h; + if (j <= (lo + h - 1)) break; + } + ptr[j] = v; + i++; + + /*-- copy 2 --*/ + if (i > hi) break; + v = ptr[i]; + j = i; + while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { + ptr[j] = ptr[j-h]; + j = j - h; + if (j <= (lo + h - 1)) break; + } + ptr[j] = v; + i++; + + /*-- copy 3 --*/ + if (i > hi) break; + v = ptr[i]; + j = i; + while (mainGtU (ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { + ptr[j] = ptr[j-h]; + j = j - h; + if (j <= (lo + h - 1)) break; + } + ptr[j] = v; + i++; + + if (*budget < 0) return; + } + } +} + + +/*---------------------------------------------*/ +/* + * The following is an implementation of + * an elegant 3-way quicksort for strings, + * described in a paper "Fast Algorithms for + * Sorting and Searching Strings", by Robert + * Sedgewick and Jon L. Bentley. + */ + +#define mswap(zz1, zz2) \ +{ \ + int32_t zztmp = zz1; \ + zz1 = zz2; \ + zz2 = zztmp; \ +} + +#define mvswap(zzp1, zzp2, zzn) \ +{ \ + int32_t yyp1 = (zzp1); \ + int32_t yyp2 = (zzp2); \ + int32_t yyn = (zzn); \ + while (yyn > 0) { \ + mswap(ptr[yyp1], ptr[yyp2]); \ + yyp1++; \ + yyp2++; \ + yyn--; \ + } \ +} + +static +inline +UChar mmed3(UChar a, UChar b, UChar c) +{ + UChar t; + if (a > b) { + t = a; + a = b; + b = t; + }; + if (b > c) { + b = c; + if (a > b) + b = a; + } + return b; +} + +#define mmin(a,b) ((a) < (b)) ? (a) : (b) + +#define mpush(lz,hz,dz) \ +{ \ + stackLo[sp] = lz; \ + stackHi[sp] = hz; \ + stackD [sp] = dz; \ + sp++; \ +} + +#define mpop(lz,hz,dz) \ +{ \ + sp--; \ + lz = stackLo[sp]; \ + hz = stackHi[sp]; \ + dz = stackD [sp]; \ +} + + +#define mnextsize(az) (nextHi[az]-nextLo[az]) + +#define mnextswap(az,bz) \ +{ \ + int32_t tz; \ + tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ + tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ + tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \ +} + +#define MAIN_QSORT_SMALL_THRESH 20 +#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) +#define MAIN_QSORT_STACK_SIZE 100 + +static +void mainQSort3(uint32_t* ptr, + UChar* block, + uint16_t* quadrant, + int32_t nblock, + int32_t loSt, + int32_t hiSt, + int32_t dSt, + int32_t* budget) +{ + int32_t unLo, unHi, ltLo, gtHi, n, m, med; + int32_t sp, lo, hi, d; + + int32_t stackLo[MAIN_QSORT_STACK_SIZE]; + int32_t stackHi[MAIN_QSORT_STACK_SIZE]; + int32_t stackD [MAIN_QSORT_STACK_SIZE]; + + int32_t nextLo[3]; + int32_t nextHi[3]; + int32_t nextD [3]; + + sp = 0; + mpush(loSt, hiSt, dSt); + + while (sp > 0) { + AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001); + + mpop(lo, hi, d); + if (hi - lo < MAIN_QSORT_SMALL_THRESH + || d > MAIN_QSORT_DEPTH_THRESH + ) { + mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget); + if (*budget < 0) + return; + continue; + } + + med = (int32_t) mmed3(block[ptr[lo ] + d], + block[ptr[hi ] + d], + block[ptr[(lo+hi) >> 1] + d]); + + unLo = ltLo = lo; + unHi = gtHi = hi; + + while (1) { + while (1) { + if (unLo > unHi) + break; + n = ((int32_t)block[ptr[unLo]+d]) - med; + if (n == 0) { + mswap(ptr[unLo], ptr[ltLo]); + ltLo++; + unLo++; + continue; + }; + if (n > 0) break; + unLo++; + } + while (1) { + if (unLo > unHi) + break; + n = ((int32_t)block[ptr[unHi]+d]) - med; + if (n == 0) { + mswap(ptr[unHi], ptr[gtHi]); + gtHi--; + unHi--; + continue; + }; + if (n < 0) break; + unHi--; + } + if (unLo > unHi) + break; + mswap(ptr[unLo], ptr[unHi]); + unLo++; + unHi--; + } + + AssertD(unHi == unLo-1, "mainQSort3(2)"); + + if (gtHi < ltLo) { + mpush(lo, hi, d + 1); + continue; + } + + n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); + m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); + + n = lo + unLo - ltLo - 1; + m = hi - (gtHi - unHi) + 1; + + nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; + nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; + nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; + + if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); + if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); + if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); + + AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)"); + AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)"); + + mpush (nextLo[0], nextHi[0], nextD[0]); + mpush (nextLo[1], nextHi[1], nextD[1]); + mpush (nextLo[2], nextHi[2], nextD[2]); + } +} + +#undef mswap +#undef mvswap +#undef mpush +#undef mpop +#undef mmin +#undef mnextsize +#undef mnextswap +#undef MAIN_QSORT_SMALL_THRESH +#undef MAIN_QSORT_DEPTH_THRESH +#undef MAIN_QSORT_STACK_SIZE + + +/*---------------------------------------------*/ +/* Pre: + * nblock > N_OVERSHOOT + * block32 exists for [0 .. nblock-1 +N_OVERSHOOT] + * ((UChar*)block32) [0 .. nblock-1] holds block + * ptr exists for [0 .. nblock-1] + * + * Post: + * ((UChar*)block32) [0 .. nblock-1] holds block + * All other areas of block32 destroyed + * ftab[0 .. 65536] destroyed + * ptr [0 .. nblock-1] holds sorted order + * if (*budget < 0), sorting was abandoned + */ + +#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) +#define SETMASK (1 << 21) +#define CLEARMASK (~(SETMASK)) + +static NOINLINE +void mainSort(uint32_t* ptr, + UChar* block, + uint16_t* quadrant, + uint32_t* ftab, + int32_t nblock, + int32_t* budget) +{ + int32_t i, j, k, ss, sb; + int32_t runningOrder[256]; + Bool bigDone[256]; + int32_t copyStart[256]; + int32_t copyEnd [256]; + UChar c1; + int32_t numQSorted; + uint16_t s; + + /*-- set up the 2-byte frequency table --*/ + /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */ + memset(ftab, 0, 65537 * sizeof(ftab[0])); + + j = block[0] << 8; + i = nblock-1; +#if 0 + for (; i >= 3; i -= 4) { + quadrant[i] = 0; + j = (j >> 8) |(((uint16_t)block[i]) << 8); + ftab[j]++; + quadrant[i-1] = 0; + j = (j >> 8) |(((uint16_t)block[i-1]) << 8); + ftab[j]++; + quadrant[i-2] = 0; + j = (j >> 8) |(((uint16_t)block[i-2]) << 8); + ftab[j]++; + quadrant[i-3] = 0; + j = (j >> 8) |(((uint16_t)block[i-3]) << 8); + ftab[j]++; + } +#endif + for (; i >= 0; i--) { + quadrant[i] = 0; + j = (j >> 8) |(((uint16_t)block[i]) << 8); + ftab[j]++; + } + + /*-- (emphasises close relationship of block & quadrant) --*/ + for (i = 0; i < BZ_N_OVERSHOOT; i++) { + block [nblock+i] = block[i]; + quadrant[nblock+i] = 0; + } + + /*-- Complete the initial radix sort --*/ + for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; + + s = block[0] << 8; + i = nblock-1; +#if 0 + for (; i >= 3; i -= 4) { + s = (s >> 8) | (block[i] << 8); + j = ftab[s] -1; + ftab[s] = j; + ptr[j] = i; + s = (s >> 8) | (block[i-1] << 8); + j = ftab[s] -1; + ftab[s] = j; + ptr[j] = i-1; + s = (s >> 8) | (block[i-2] << 8); + j = ftab[s] -1; + ftab[s] = j; + ptr[j] = i-2; + s = (s >> 8) | (block[i-3] << 8); + j = ftab[s] -1; + ftab[s] = j; + ptr[j] = i-3; + } +#endif + for (; i >= 0; i--) { + s = (s >> 8) | (block[i] << 8); + j = ftab[s] -1; + ftab[s] = j; + ptr[j] = i; + } + + /* + * Now ftab contains the first loc of every small bucket. + * Calculate the running order, from smallest to largest + * big bucket. + */ + for (i = 0; i <= 255; i++) { + bigDone [i] = False; + runningOrder[i] = i; + } + + { + int32_t vv; + /* was: int32_t h = 1; */ + /* do h = 3 * h + 1; while (h <= 256); */ + int32_t h = 364; + + do { + h = h / 3; + for (i = h; i <= 255; i++) { + vv = runningOrder[i]; + j = i; + while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) { + runningOrder[j] = runningOrder[j-h]; + j = j - h; + if (j <= (h - 1)) goto zero; + } + zero: + runningOrder[j] = vv; + } + } while (h != 1); + } + + /* + * The main sorting loop. + */ + + numQSorted = 0; + + for (i = 0; i <= 255; i++) { + + /* + * Process big buckets, starting with the least full. + * Basically this is a 3-step process in which we call + * mainQSort3 to sort the small buckets [ss, j], but + * also make a big effort to avoid the calls if we can. + */ + ss = runningOrder[i]; + + /* + * Step 1: + * Complete the big bucket [ss] by quicksorting + * any unsorted small buckets [ss, j], for j != ss. + * Hopefully previous pointer-scanning phases have already + * completed many of the small buckets [ss, j], so + * we don't have to sort them at all. + */ + for (j = 0; j <= 255; j++) { + if (j != ss) { + sb = (ss << 8) + j; + if (!(ftab[sb] & SETMASK)) { + int32_t lo = ftab[sb] & CLEARMASK; + int32_t hi = (ftab[sb+1] & CLEARMASK) - 1; + if (hi > lo) { + mainQSort3 ( + ptr, block, quadrant, nblock, + lo, hi, BZ_N_RADIX, budget + ); + numQSorted += (hi - lo + 1); + if (*budget < 0) return; + } + } + ftab[sb] |= SETMASK; + } + } + + AssertH(!bigDone[ss], 1006); + + /* + * Step 2: + * Now scan this big bucket [ss] so as to synthesise the + * sorted order for small buckets [t, ss] for all t, + * including, magically, the bucket [ss,ss] too. + * This will avoid doing Real Work in subsequent Step 1's. + */ + { + for (j = 0; j <= 255; j++) { + copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; + copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; + } + for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { + k = ptr[j] - 1; + if (k < 0) + k += nblock; + c1 = block[k]; + if (!bigDone[c1]) + ptr[copyStart[c1]++] = k; + } + for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { + k = ptr[j]-1; + if (k < 0) + k += nblock; + c1 = block[k]; + if (!bigDone[c1]) + ptr[copyEnd[c1]--] = k; + } + } + + AssertH((copyStart[ss]-1 == copyEnd[ss]) + || + /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. + * Necessity for this case is demonstrated by compressing + * a sequence of approximately 48.5 million of character + * 251; 1.0.0/1.0.1 will then die here. */ + (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), + 1007) + + for (j = 0; j <= 255; j++) + ftab[(j << 8) + ss] |= SETMASK; + + /* + * Step 3: + * The [ss] big bucket is now done. Record this fact, + * and update the quadrant descriptors. Remember to + * update quadrants in the overshoot area too, if + * necessary. The "if (i < 255)" test merely skips + * this updating for the last bucket processed, since + * updating for the last bucket is pointless. + * + * The quadrant array provides a way to incrementally + * cache sort orderings, as they appear, so as to + * make subsequent comparisons in fullGtU() complete + * faster. For repetitive blocks this makes a big + * difference (but not big enough to be able to avoid + * the fallback sorting mechanism, exponential radix sort). + * + * The precise meaning is: at all times: + * + * for 0 <= i < nblock and 0 <= j <= nblock + * + * if block[i] != block[j], + * + * then the relative values of quadrant[i] and + * quadrant[j] are meaningless. + * + * else { + * if quadrant[i] < quadrant[j] + * then the string starting at i lexicographically + * precedes the string starting at j + * + * else if quadrant[i] > quadrant[j] + * then the string starting at j lexicographically + * precedes the string starting at i + * + * else + * the relative ordering of the strings starting + * at i and j has not yet been determined. + * } + */ + bigDone[ss] = True; + + if (i < 255) { + int32_t bbStart = ftab[ss << 8] & CLEARMASK; + int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; + int32_t shifts = 0; + + while ((bbSize >> shifts) > 65534) shifts++; + + for (j = bbSize-1; j >= 0; j--) { + int32_t a2update = ptr[bbStart + j]; + uint16_t qVal = (uint16_t)(j >> shifts); + quadrant[a2update] = qVal; + if (a2update < BZ_N_OVERSHOOT) + quadrant[a2update + nblock] = qVal; + } + AssertH(((bbSize-1) >> shifts) <= 65535, 1002); + } + + } +} + +#undef BIGFREQ +#undef SETMASK +#undef CLEARMASK + + +/*---------------------------------------------*/ +/* Pre: + * nblock > 0 + * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] + * ((UChar*)arr2)[0 .. nblock-1] holds block + * arr1 exists for [0 .. nblock-1] + * + * Post: + * ((UChar*)arr2) [0 .. nblock-1] holds block + * All other areas of block destroyed + * ftab[0 .. 65536] destroyed + * arr1[0 .. nblock-1] holds sorted order + */ +static NOINLINE +void BZ2_blockSort(EState* s) +{ + /* In original bzip2 1.0.4, it's a parameter, but 30 + * should work ok. */ + enum { wfact = 30 }; + + uint32_t* ptr = s->ptr; + UChar* block = s->block; + uint32_t* ftab = s->ftab; + int32_t nblock = s->nblock; + uint16_t* quadrant; + int32_t budget; + int32_t i; + + if (nblock < 10000) { + fallbackSort(s->arr1, s->arr2, ftab, nblock); + } else { + /* Calculate the location for quadrant, remembering to get + * the alignment right. Assumes that &(block[0]) is at least + * 2-byte aligned -- this should be ok since block is really + * the first section of arr2. + */ + i = nblock + BZ_N_OVERSHOOT; + if (i & 1) i++; + quadrant = (uint16_t*)(&(block[i])); + + /* (wfact-1) / 3 puts the default-factor-30 + * transition point at very roughly the same place as + * with v0.1 and v0.9.0. + * Not that it particularly matters any more, since the + * resulting compressed stream is now the same regardless + * of whether or not we use the main sort or fallback sort. + */ + budget = nblock * ((wfact-1) / 3); + + mainSort(ptr, block, quadrant, ftab, nblock, &budget); + if (budget < 0) { + fallbackSort(s->arr1, s->arr2, ftab, nblock); + } + } + + s->origPtr = -1; + for (i = 0; i < s->nblock; i++) + if (ptr[i] == 0) { + s->origPtr = i; break; + }; + + AssertH(s->origPtr != -1, 1003); +} + + +/*-------------------------------------------------------------*/ +/*--- end blocksort.c ---*/ +/*-------------------------------------------------------------*/ |