summaryrefslogtreecommitdiff
path: root/archival/bz/blocksort.c
diff options
context:
space:
mode:
authorDenis Vlasenko2007-10-13 03:36:03 +0000
committerDenis Vlasenko2007-10-13 03:36:03 +0000
commit77f1ec1b9bf100e6c10aa0856c7156e321511785 (patch)
treef20e5a9062ecad82a43bde81e3041a19c4292733 /archival/bz/blocksort.c
parent11c23d7b990eae27357e5a41a97d62b9a214f7db (diff)
downloadbusybox-77f1ec1b9bf100e6c10aa0856c7156e321511785.zip
busybox-77f1ec1b9bf100e6c10aa0856c7156e321511785.tar.gz
bzip2: port bzip2 1.0.4 to busybox. note: bzip2 code resides
in separate directory (archival/bz/*) and is covered by BSD-style license. code size: 13k
Diffstat (limited to 'archival/bz/blocksort.c')
-rw-r--r--archival/bz/blocksort.c1128
1 files changed, 1128 insertions, 0 deletions
diff --git a/archival/bz/blocksort.c b/archival/bz/blocksort.c
new file mode 100644
index 0000000..7d2856b
--- /dev/null
+++ b/archival/bz/blocksort.c
@@ -0,0 +1,1128 @@
+/*
+ * bzip2 is written by Julian Seward <jseward@bzip.org>.
+ * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
+ * See README and LICENSE files in this directory for more information.
+ */
+
+/*-------------------------------------------------------------*/
+/*--- Block sorting machinery ---*/
+/*--- blocksort.c ---*/
+/*-------------------------------------------------------------*/
+
+/* ------------------------------------------------------------------
+This file is part of bzip2/libbzip2, a program and library for
+lossless, block-sorting data compression.
+
+bzip2/libbzip2 version 1.0.4 of 20 December 2006
+Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
+
+Please read the WARNING, DISCLAIMER and PATENTS sections in the
+README file.
+
+This program is released under the terms of the license contained
+in the file LICENSE.
+------------------------------------------------------------------ */
+
+/* #include "bzlib_private.h" */
+
+/*---------------------------------------------*/
+/*--- Fallback O(N log(N)^2) sorting ---*/
+/*--- algorithm, for repetitive blocks ---*/
+/*---------------------------------------------*/
+
+/*---------------------------------------------*/
+static
+inline
+void fallbackSimpleSort(uint32_t* fmap,
+ uint32_t* eclass,
+ int32_t lo,
+ int32_t hi)
+{
+ int32_t i, j, tmp;
+ uint32_t ec_tmp;
+
+ if (lo == hi) return;
+
+ if (hi - lo > 3) {
+ for (i = hi-4; i >= lo; i--) {
+ tmp = fmap[i];
+ ec_tmp = eclass[tmp];
+ for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
+ fmap[j-4] = fmap[j];
+ fmap[j-4] = tmp;
+ }
+ }
+
+ for (i = hi-1; i >= lo; i--) {
+ tmp = fmap[i];
+ ec_tmp = eclass[tmp];
+ for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
+ fmap[j-1] = fmap[j];
+ fmap[j-1] = tmp;
+ }
+}
+
+
+/*---------------------------------------------*/
+#define fswap(zz1, zz2) \
+{ \
+ int32_t zztmp = zz1; \
+ zz1 = zz2; \
+ zz2 = zztmp; \
+}
+
+#define fvswap(zzp1, zzp2, zzn) \
+{ \
+ int32_t yyp1 = (zzp1); \
+ int32_t yyp2 = (zzp2); \
+ int32_t yyn = (zzn); \
+ while (yyn > 0) { \
+ fswap(fmap[yyp1], fmap[yyp2]); \
+ yyp1++; \
+ yyp2++; \
+ yyn--; \
+ } \
+}
+
+
+#define fmin(a,b) ((a) < (b)) ? (a) : (b)
+
+#define fpush(lz,hz) { \
+ stackLo[sp] = lz; \
+ stackHi[sp] = hz; \
+ sp++; \
+}
+
+#define fpop(lz,hz) { \
+ sp--; \
+ lz = stackLo[sp]; \
+ hz = stackHi[sp]; \
+}
+
+#define FALLBACK_QSORT_SMALL_THRESH 10
+#define FALLBACK_QSORT_STACK_SIZE 100
+
+
+static
+void fallbackQSort3(uint32_t* fmap,
+ uint32_t* eclass,
+ int32_t loSt,
+ int32_t hiSt)
+{
+ int32_t unLo, unHi, ltLo, gtHi, n, m;
+ int32_t sp, lo, hi;
+ uint32_t med, r, r3;
+ int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
+ int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
+
+ r = 0;
+
+ sp = 0;
+ fpush(loSt, hiSt);
+
+ while (sp > 0) {
+ AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
+
+ fpop(lo, hi);
+ if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
+ fallbackSimpleSort(fmap, eclass, lo, hi);
+ continue;
+ }
+
+ /* Random partitioning. Median of 3 sometimes fails to
+ * avoid bad cases. Median of 9 seems to help but
+ * looks rather expensive. This too seems to work but
+ * is cheaper. Guidance for the magic constants
+ * 7621 and 32768 is taken from Sedgewick's algorithms
+ * book, chapter 35.
+ */
+ r = ((r * 7621) + 1) % 32768;
+ r3 = r % 3;
+ if (r3 == 0)
+ med = eclass[fmap[lo]];
+ else if (r3 == 1)
+ med = eclass[fmap[(lo+hi)>>1]];
+ else
+ med = eclass[fmap[hi]];
+
+ unLo = ltLo = lo;
+ unHi = gtHi = hi;
+
+ while (1) {
+ while (1) {
+ if (unLo > unHi) break;
+ n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
+ if (n == 0) {
+ fswap(fmap[unLo], fmap[ltLo]);
+ ltLo++;
+ unLo++;
+ continue;
+ };
+ if (n > 0) break;
+ unLo++;
+ }
+ while (1) {
+ if (unLo > unHi) break;
+ n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
+ if (n == 0) {
+ fswap(fmap[unHi], fmap[gtHi]);
+ gtHi--; unHi--;
+ continue;
+ };
+ if (n < 0) break;
+ unHi--;
+ }
+ if (unLo > unHi) break;
+ fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
+ }
+
+ AssertD(unHi == unLo-1, "fallbackQSort3(2)");
+
+ if (gtHi < ltLo) continue;
+
+ n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
+ m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
+
+ n = lo + unLo - ltLo - 1;
+ m = hi - (gtHi - unHi) + 1;
+
+ if (n - lo > hi - m) {
+ fpush(lo, n);
+ fpush(m, hi);
+ } else {
+ fpush(m, hi);
+ fpush(lo, n);
+ }
+ }
+}
+
+#undef fmin
+#undef fpush
+#undef fpop
+#undef fswap
+#undef fvswap
+#undef FALLBACK_QSORT_SMALL_THRESH
+#undef FALLBACK_QSORT_STACK_SIZE
+
+
+/*---------------------------------------------*/
+/* Pre:
+ * nblock > 0
+ * eclass exists for [0 .. nblock-1]
+ * ((UChar*)eclass) [0 .. nblock-1] holds block
+ * ptr exists for [0 .. nblock-1]
+ *
+ * Post:
+ * ((UChar*)eclass) [0 .. nblock-1] holds block
+ * All other areas of eclass destroyed
+ * fmap [0 .. nblock-1] holds sorted order
+ * bhtab[0 .. 2+(nblock/32)] destroyed
+*/
+
+#define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
+#define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
+#define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
+#define WORD_BH(zz) bhtab[(zz) >> 5]
+#define UNALIGNED_BH(zz) ((zz) & 0x01f)
+
+static
+void fallbackSort(uint32_t* fmap,
+ uint32_t* eclass,
+ uint32_t* bhtab,
+ int32_t nblock)
+{
+ int32_t ftab[257];
+ int32_t ftabCopy[256];
+ int32_t H, i, j, k, l, r, cc, cc1;
+ int32_t nNotDone;
+ int32_t nBhtab;
+ UChar* eclass8 = (UChar*)eclass;
+
+ /*
+ * Initial 1-char radix sort to generate
+ * initial fmap and initial BH bits.
+ */
+ for (i = 0; i < 257; i++) ftab[i] = 0;
+ for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
+ for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
+ for (i = 1; i < 257; i++) ftab[i] += ftab[i-1];
+
+ for (i = 0; i < nblock; i++) {
+ j = eclass8[i];
+ k = ftab[j] - 1;
+ ftab[j] = k;
+ fmap[k] = i;
+ }
+
+ nBhtab = 2 + (nblock / 32);
+ for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
+ for (i = 0; i < 256; i++) SET_BH(ftab[i]);
+
+ /*
+ * Inductively refine the buckets. Kind-of an
+ * "exponential radix sort" (!), inspired by the
+ * Manber-Myers suffix array construction algorithm.
+ */
+
+ /*-- set sentinel bits for block-end detection --*/
+ for (i = 0; i < 32; i++) {
+ SET_BH(nblock + 2*i);
+ CLEAR_BH(nblock + 2*i + 1);
+ }
+
+ /*-- the log(N) loop --*/
+ H = 1;
+ while (1) {
+ j = 0;
+ for (i = 0; i < nblock; i++) {
+ if (ISSET_BH(i))
+ j = i;
+ k = fmap[i] - H;
+ if (k < 0)
+ k += nblock;
+ eclass[k] = j;
+ }
+
+ nNotDone = 0;
+ r = -1;
+ while (1) {
+
+ /*-- find the next non-singleton bucket --*/
+ k = r + 1;
+ while (ISSET_BH(k) && UNALIGNED_BH(k))
+ k++;
+ if (ISSET_BH(k)) {
+ while (WORD_BH(k) == 0xffffffff) k += 32;
+ while (ISSET_BH(k)) k++;
+ }
+ l = k - 1;
+ if (l >= nblock)
+ break;
+ while (!ISSET_BH(k) && UNALIGNED_BH(k))
+ k++;
+ if (!ISSET_BH(k)) {
+ while (WORD_BH(k) == 0x00000000) k += 32;
+ while (!ISSET_BH(k)) k++;
+ }
+ r = k - 1;
+ if (r >= nblock)
+ break;
+
+ /*-- now [l, r] bracket current bucket --*/
+ if (r > l) {
+ nNotDone += (r - l + 1);
+ fallbackQSort3(fmap, eclass, l, r);
+
+ /*-- scan bucket and generate header bits-- */
+ cc = -1;
+ for (i = l; i <= r; i++) {
+ cc1 = eclass[fmap[i]];
+ if (cc != cc1) {
+ SET_BH(i);
+ cc = cc1;
+ };
+ }
+ }
+ }
+
+ H *= 2;
+ if (H > nblock || nNotDone == 0)
+ break;
+ }
+
+ /*
+ * Reconstruct the original block in
+ * eclass8 [0 .. nblock-1], since the
+ * previous phase destroyed it.
+ */
+ j = 0;
+ for (i = 0; i < nblock; i++) {
+ while (ftabCopy[j] == 0)
+ j++;
+ ftabCopy[j]--;
+ eclass8[fmap[i]] = (UChar)j;
+ }
+ AssertH(j < 256, 1005);
+}
+
+#undef SET_BH
+#undef CLEAR_BH
+#undef ISSET_BH
+#undef WORD_BH
+#undef UNALIGNED_BH
+
+
+/*---------------------------------------------*/
+/*--- The main, O(N^2 log(N)) sorting ---*/
+/*--- algorithm. Faster for "normal" ---*/
+/*--- non-repetitive blocks. ---*/
+/*---------------------------------------------*/
+
+/*---------------------------------------------*/
+static
+inline
+Bool mainGtU(
+ uint32_t i1,
+ uint32_t i2,
+ UChar* block,
+ uint16_t* quadrant,
+ uint32_t nblock,
+ int32_t* budget)
+{
+ int32_t k;
+ UChar c1, c2;
+ uint16_t s1, s2;
+
+///unrolling
+ AssertD(i1 != i2, "mainGtU");
+ /* 1 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 2 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 3 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 4 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 5 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 6 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 7 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 8 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 9 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 10 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 11 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+ /* 12 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ i1++; i2++;
+
+ k = nblock + 8;
+
+///unrolling
+ do {
+ /* 1 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 2 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 3 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 4 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 5 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 6 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 7 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+ /* 8 */
+ c1 = block[i1]; c2 = block[i2];
+ if (c1 != c2) return (c1 > c2);
+ s1 = quadrant[i1]; s2 = quadrant[i2];
+ if (s1 != s2) return (s1 > s2);
+ i1++; i2++;
+
+ if (i1 >= nblock) i1 -= nblock;
+ if (i2 >= nblock) i2 -= nblock;
+
+ k -= 8;
+ (*budget)--;
+ } while (k >= 0);
+
+ return False;
+}
+
+
+/*---------------------------------------------*/
+/*
+ * Knuth's increments seem to work better
+ * than Incerpi-Sedgewick here. Possibly
+ * because the number of elems to sort is
+ * usually small, typically <= 20.
+ */
+static
+const int32_t incs[14] = {
+ 1, 4, 13, 40, 121, 364, 1093, 3280,
+ 9841, 29524, 88573, 265720,
+ 797161, 2391484
+};
+
+static
+void mainSimpleSort(uint32_t* ptr,
+ UChar* block,
+ uint16_t* quadrant,
+ int32_t nblock,
+ int32_t lo,
+ int32_t hi,
+ int32_t d,
+ int32_t* budget)
+{
+ int32_t i, j, h, bigN, hp;
+ uint32_t v;
+
+ bigN = hi - lo + 1;
+ if (bigN < 2) return;
+
+ hp = 0;
+ while (incs[hp] < bigN) hp++;
+ hp--;
+
+ for (; hp >= 0; hp--) {
+ h = incs[hp];
+
+ i = lo + h;
+ while (1) {
+
+///unrolling
+ /*-- copy 1 --*/
+ if (i > hi) break;
+ v = ptr[i];
+ j = i;
+ while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
+ ptr[j] = ptr[j-h];
+ j = j - h;
+ if (j <= (lo + h - 1)) break;
+ }
+ ptr[j] = v;
+ i++;
+
+ /*-- copy 2 --*/
+ if (i > hi) break;
+ v = ptr[i];
+ j = i;
+ while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
+ ptr[j] = ptr[j-h];
+ j = j - h;
+ if (j <= (lo + h - 1)) break;
+ }
+ ptr[j] = v;
+ i++;
+
+ /*-- copy 3 --*/
+ if (i > hi) break;
+ v = ptr[i];
+ j = i;
+ while (mainGtU (ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
+ ptr[j] = ptr[j-h];
+ j = j - h;
+ if (j <= (lo + h - 1)) break;
+ }
+ ptr[j] = v;
+ i++;
+
+ if (*budget < 0) return;
+ }
+ }
+}
+
+
+/*---------------------------------------------*/
+/*
+ * The following is an implementation of
+ * an elegant 3-way quicksort for strings,
+ * described in a paper "Fast Algorithms for
+ * Sorting and Searching Strings", by Robert
+ * Sedgewick and Jon L. Bentley.
+ */
+
+#define mswap(zz1, zz2) \
+{ \
+ int32_t zztmp = zz1; \
+ zz1 = zz2; \
+ zz2 = zztmp; \
+}
+
+#define mvswap(zzp1, zzp2, zzn) \
+{ \
+ int32_t yyp1 = (zzp1); \
+ int32_t yyp2 = (zzp2); \
+ int32_t yyn = (zzn); \
+ while (yyn > 0) { \
+ mswap(ptr[yyp1], ptr[yyp2]); \
+ yyp1++; \
+ yyp2++; \
+ yyn--; \
+ } \
+}
+
+static
+inline
+UChar mmed3(UChar a, UChar b, UChar c)
+{
+ UChar t;
+ if (a > b) {
+ t = a;
+ a = b;
+ b = t;
+ };
+ if (b > c) {
+ b = c;
+ if (a > b)
+ b = a;
+ }
+ return b;
+}
+
+#define mmin(a,b) ((a) < (b)) ? (a) : (b)
+
+#define mpush(lz,hz,dz) \
+{ \
+ stackLo[sp] = lz; \
+ stackHi[sp] = hz; \
+ stackD [sp] = dz; \
+ sp++; \
+}
+
+#define mpop(lz,hz,dz) \
+{ \
+ sp--; \
+ lz = stackLo[sp]; \
+ hz = stackHi[sp]; \
+ dz = stackD [sp]; \
+}
+
+
+#define mnextsize(az) (nextHi[az]-nextLo[az])
+
+#define mnextswap(az,bz) \
+{ \
+ int32_t tz; \
+ tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
+ tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
+ tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
+}
+
+#define MAIN_QSORT_SMALL_THRESH 20
+#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
+#define MAIN_QSORT_STACK_SIZE 100
+
+static
+void mainQSort3(uint32_t* ptr,
+ UChar* block,
+ uint16_t* quadrant,
+ int32_t nblock,
+ int32_t loSt,
+ int32_t hiSt,
+ int32_t dSt,
+ int32_t* budget)
+{
+ int32_t unLo, unHi, ltLo, gtHi, n, m, med;
+ int32_t sp, lo, hi, d;
+
+ int32_t stackLo[MAIN_QSORT_STACK_SIZE];
+ int32_t stackHi[MAIN_QSORT_STACK_SIZE];
+ int32_t stackD [MAIN_QSORT_STACK_SIZE];
+
+ int32_t nextLo[3];
+ int32_t nextHi[3];
+ int32_t nextD [3];
+
+ sp = 0;
+ mpush(loSt, hiSt, dSt);
+
+ while (sp > 0) {
+ AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
+
+ mpop(lo, hi, d);
+ if (hi - lo < MAIN_QSORT_SMALL_THRESH
+ || d > MAIN_QSORT_DEPTH_THRESH
+ ) {
+ mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
+ if (*budget < 0)
+ return;
+ continue;
+ }
+
+ med = (int32_t) mmed3(block[ptr[lo ] + d],
+ block[ptr[hi ] + d],
+ block[ptr[(lo+hi) >> 1] + d]);
+
+ unLo = ltLo = lo;
+ unHi = gtHi = hi;
+
+ while (1) {
+ while (1) {
+ if (unLo > unHi)
+ break;
+ n = ((int32_t)block[ptr[unLo]+d]) - med;
+ if (n == 0) {
+ mswap(ptr[unLo], ptr[ltLo]);
+ ltLo++;
+ unLo++;
+ continue;
+ };
+ if (n > 0) break;
+ unLo++;
+ }
+ while (1) {
+ if (unLo > unHi)
+ break;
+ n = ((int32_t)block[ptr[unHi]+d]) - med;
+ if (n == 0) {
+ mswap(ptr[unHi], ptr[gtHi]);
+ gtHi--;
+ unHi--;
+ continue;
+ };
+ if (n < 0) break;
+ unHi--;
+ }
+ if (unLo > unHi)
+ break;
+ mswap(ptr[unLo], ptr[unHi]);
+ unLo++;
+ unHi--;
+ }
+
+ AssertD(unHi == unLo-1, "mainQSort3(2)");
+
+ if (gtHi < ltLo) {
+ mpush(lo, hi, d + 1);
+ continue;
+ }
+
+ n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
+ m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
+
+ n = lo + unLo - ltLo - 1;
+ m = hi - (gtHi - unHi) + 1;
+
+ nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
+ nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
+ nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
+
+ if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+ if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
+ if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+
+ AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
+ AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
+
+ mpush (nextLo[0], nextHi[0], nextD[0]);
+ mpush (nextLo[1], nextHi[1], nextD[1]);
+ mpush (nextLo[2], nextHi[2], nextD[2]);
+ }
+}
+
+#undef mswap
+#undef mvswap
+#undef mpush
+#undef mpop
+#undef mmin
+#undef mnextsize
+#undef mnextswap
+#undef MAIN_QSORT_SMALL_THRESH
+#undef MAIN_QSORT_DEPTH_THRESH
+#undef MAIN_QSORT_STACK_SIZE
+
+
+/*---------------------------------------------*/
+/* Pre:
+ * nblock > N_OVERSHOOT
+ * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
+ * ((UChar*)block32) [0 .. nblock-1] holds block
+ * ptr exists for [0 .. nblock-1]
+ *
+ * Post:
+ * ((UChar*)block32) [0 .. nblock-1] holds block
+ * All other areas of block32 destroyed
+ * ftab[0 .. 65536] destroyed
+ * ptr [0 .. nblock-1] holds sorted order
+ * if (*budget < 0), sorting was abandoned
+ */
+
+#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
+#define SETMASK (1 << 21)
+#define CLEARMASK (~(SETMASK))
+
+static NOINLINE
+void mainSort(uint32_t* ptr,
+ UChar* block,
+ uint16_t* quadrant,
+ uint32_t* ftab,
+ int32_t nblock,
+ int32_t* budget)
+{
+ int32_t i, j, k, ss, sb;
+ int32_t runningOrder[256];
+ Bool bigDone[256];
+ int32_t copyStart[256];
+ int32_t copyEnd [256];
+ UChar c1;
+ int32_t numQSorted;
+ uint16_t s;
+
+ /*-- set up the 2-byte frequency table --*/
+ /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
+ memset(ftab, 0, 65537 * sizeof(ftab[0]));
+
+ j = block[0] << 8;
+ i = nblock-1;
+#if 0
+ for (; i >= 3; i -= 4) {
+ quadrant[i] = 0;
+ j = (j >> 8) |(((uint16_t)block[i]) << 8);
+ ftab[j]++;
+ quadrant[i-1] = 0;
+ j = (j >> 8) |(((uint16_t)block[i-1]) << 8);
+ ftab[j]++;
+ quadrant[i-2] = 0;
+ j = (j >> 8) |(((uint16_t)block[i-2]) << 8);
+ ftab[j]++;
+ quadrant[i-3] = 0;
+ j = (j >> 8) |(((uint16_t)block[i-3]) << 8);
+ ftab[j]++;
+ }
+#endif
+ for (; i >= 0; i--) {
+ quadrant[i] = 0;
+ j = (j >> 8) |(((uint16_t)block[i]) << 8);
+ ftab[j]++;
+ }
+
+ /*-- (emphasises close relationship of block & quadrant) --*/
+ for (i = 0; i < BZ_N_OVERSHOOT; i++) {
+ block [nblock+i] = block[i];
+ quadrant[nblock+i] = 0;
+ }
+
+ /*-- Complete the initial radix sort --*/
+ for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
+
+ s = block[0] << 8;
+ i = nblock-1;
+#if 0
+ for (; i >= 3; i -= 4) {
+ s = (s >> 8) | (block[i] << 8);
+ j = ftab[s] -1;
+ ftab[s] = j;
+ ptr[j] = i;
+ s = (s >> 8) | (block[i-1] << 8);
+ j = ftab[s] -1;
+ ftab[s] = j;
+ ptr[j] = i-1;
+ s = (s >> 8) | (block[i-2] << 8);
+ j = ftab[s] -1;
+ ftab[s] = j;
+ ptr[j] = i-2;
+ s = (s >> 8) | (block[i-3] << 8);
+ j = ftab[s] -1;
+ ftab[s] = j;
+ ptr[j] = i-3;
+ }
+#endif
+ for (; i >= 0; i--) {
+ s = (s >> 8) | (block[i] << 8);
+ j = ftab[s] -1;
+ ftab[s] = j;
+ ptr[j] = i;
+ }
+
+ /*
+ * Now ftab contains the first loc of every small bucket.
+ * Calculate the running order, from smallest to largest
+ * big bucket.
+ */
+ for (i = 0; i <= 255; i++) {
+ bigDone [i] = False;
+ runningOrder[i] = i;
+ }
+
+ {
+ int32_t vv;
+ /* was: int32_t h = 1; */
+ /* do h = 3 * h + 1; while (h <= 256); */
+ int32_t h = 364;
+
+ do {
+ h = h / 3;
+ for (i = h; i <= 255; i++) {
+ vv = runningOrder[i];
+ j = i;
+ while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
+ runningOrder[j] = runningOrder[j-h];
+ j = j - h;
+ if (j <= (h - 1)) goto zero;
+ }
+ zero:
+ runningOrder[j] = vv;
+ }
+ } while (h != 1);
+ }
+
+ /*
+ * The main sorting loop.
+ */
+
+ numQSorted = 0;
+
+ for (i = 0; i <= 255; i++) {
+
+ /*
+ * Process big buckets, starting with the least full.
+ * Basically this is a 3-step process in which we call
+ * mainQSort3 to sort the small buckets [ss, j], but
+ * also make a big effort to avoid the calls if we can.
+ */
+ ss = runningOrder[i];
+
+ /*
+ * Step 1:
+ * Complete the big bucket [ss] by quicksorting
+ * any unsorted small buckets [ss, j], for j != ss.
+ * Hopefully previous pointer-scanning phases have already
+ * completed many of the small buckets [ss, j], so
+ * we don't have to sort them at all.
+ */
+ for (j = 0; j <= 255; j++) {
+ if (j != ss) {
+ sb = (ss << 8) + j;
+ if (!(ftab[sb] & SETMASK)) {
+ int32_t lo = ftab[sb] & CLEARMASK;
+ int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
+ if (hi > lo) {
+ mainQSort3 (
+ ptr, block, quadrant, nblock,
+ lo, hi, BZ_N_RADIX, budget
+ );
+ numQSorted += (hi - lo + 1);
+ if (*budget < 0) return;
+ }
+ }
+ ftab[sb] |= SETMASK;
+ }
+ }
+
+ AssertH(!bigDone[ss], 1006);
+
+ /*
+ * Step 2:
+ * Now scan this big bucket [ss] so as to synthesise the
+ * sorted order for small buckets [t, ss] for all t,
+ * including, magically, the bucket [ss,ss] too.
+ * This will avoid doing Real Work in subsequent Step 1's.
+ */
+ {
+ for (j = 0; j <= 255; j++) {
+ copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
+ copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
+ }
+ for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
+ k = ptr[j] - 1;
+ if (k < 0)
+ k += nblock;
+ c1 = block[k];
+ if (!bigDone[c1])
+ ptr[copyStart[c1]++] = k;
+ }
+ for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
+ k = ptr[j]-1;
+ if (k < 0)
+ k += nblock;
+ c1 = block[k];
+ if (!bigDone[c1])
+ ptr[copyEnd[c1]--] = k;
+ }
+ }
+
+ AssertH((copyStart[ss]-1 == copyEnd[ss])
+ ||
+ /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
+ * Necessity for this case is demonstrated by compressing
+ * a sequence of approximately 48.5 million of character
+ * 251; 1.0.0/1.0.1 will then die here. */
+ (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
+ 1007)
+
+ for (j = 0; j <= 255; j++)
+ ftab[(j << 8) + ss] |= SETMASK;
+
+ /*
+ * Step 3:
+ * The [ss] big bucket is now done. Record this fact,
+ * and update the quadrant descriptors. Remember to
+ * update quadrants in the overshoot area too, if
+ * necessary. The "if (i < 255)" test merely skips
+ * this updating for the last bucket processed, since
+ * updating for the last bucket is pointless.
+ *
+ * The quadrant array provides a way to incrementally
+ * cache sort orderings, as they appear, so as to
+ * make subsequent comparisons in fullGtU() complete
+ * faster. For repetitive blocks this makes a big
+ * difference (but not big enough to be able to avoid
+ * the fallback sorting mechanism, exponential radix sort).
+ *
+ * The precise meaning is: at all times:
+ *
+ * for 0 <= i < nblock and 0 <= j <= nblock
+ *
+ * if block[i] != block[j],
+ *
+ * then the relative values of quadrant[i] and
+ * quadrant[j] are meaningless.
+ *
+ * else {
+ * if quadrant[i] < quadrant[j]
+ * then the string starting at i lexicographically
+ * precedes the string starting at j
+ *
+ * else if quadrant[i] > quadrant[j]
+ * then the string starting at j lexicographically
+ * precedes the string starting at i
+ *
+ * else
+ * the relative ordering of the strings starting
+ * at i and j has not yet been determined.
+ * }
+ */
+ bigDone[ss] = True;
+
+ if (i < 255) {
+ int32_t bbStart = ftab[ss << 8] & CLEARMASK;
+ int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
+ int32_t shifts = 0;
+
+ while ((bbSize >> shifts) > 65534) shifts++;
+
+ for (j = bbSize-1; j >= 0; j--) {
+ int32_t a2update = ptr[bbStart + j];
+ uint16_t qVal = (uint16_t)(j >> shifts);
+ quadrant[a2update] = qVal;
+ if (a2update < BZ_N_OVERSHOOT)
+ quadrant[a2update + nblock] = qVal;
+ }
+ AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
+ }
+
+ }
+}
+
+#undef BIGFREQ
+#undef SETMASK
+#undef CLEARMASK
+
+
+/*---------------------------------------------*/
+/* Pre:
+ * nblock > 0
+ * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
+ * ((UChar*)arr2)[0 .. nblock-1] holds block
+ * arr1 exists for [0 .. nblock-1]
+ *
+ * Post:
+ * ((UChar*)arr2) [0 .. nblock-1] holds block
+ * All other areas of block destroyed
+ * ftab[0 .. 65536] destroyed
+ * arr1[0 .. nblock-1] holds sorted order
+ */
+static NOINLINE
+void BZ2_blockSort(EState* s)
+{
+ /* In original bzip2 1.0.4, it's a parameter, but 30
+ * should work ok. */
+ enum { wfact = 30 };
+
+ uint32_t* ptr = s->ptr;
+ UChar* block = s->block;
+ uint32_t* ftab = s->ftab;
+ int32_t nblock = s->nblock;
+ uint16_t* quadrant;
+ int32_t budget;
+ int32_t i;
+
+ if (nblock < 10000) {
+ fallbackSort(s->arr1, s->arr2, ftab, nblock);
+ } else {
+ /* Calculate the location for quadrant, remembering to get
+ * the alignment right. Assumes that &(block[0]) is at least
+ * 2-byte aligned -- this should be ok since block is really
+ * the first section of arr2.
+ */
+ i = nblock + BZ_N_OVERSHOOT;
+ if (i & 1) i++;
+ quadrant = (uint16_t*)(&(block[i]));
+
+ /* (wfact-1) / 3 puts the default-factor-30
+ * transition point at very roughly the same place as
+ * with v0.1 and v0.9.0.
+ * Not that it particularly matters any more, since the
+ * resulting compressed stream is now the same regardless
+ * of whether or not we use the main sort or fallback sort.
+ */
+ budget = nblock * ((wfact-1) / 3);
+
+ mainSort(ptr, block, quadrant, ftab, nblock, &budget);
+ if (budget < 0) {
+ fallbackSort(s->arr1, s->arr2, ftab, nblock);
+ }
+ }
+
+ s->origPtr = -1;
+ for (i = 0; i < s->nblock; i++)
+ if (ptr[i] == 0) {
+ s->origPtr = i; break;
+ };
+
+ AssertH(s->origPtr != -1, 1003);
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- end blocksort.c ---*/
+/*-------------------------------------------------------------*/