/*
 * bzip2 is written by Julian Seward <jseward@bzip.org>.
 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
 * See README and LICENSE files in this directory for more information.
 */

/*-------------------------------------------------------------*/
/*--- Block sorting machinery                               ---*/
/*---                                           blocksort.c ---*/
/*-------------------------------------------------------------*/

/* ------------------------------------------------------------------
This file is part of bzip2/libbzip2, a program and library for
lossless, block-sorting data compression.

bzip2/libbzip2 version 1.0.4 of 20 December 2006
Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>

Please read the WARNING, DISCLAIMER and PATENTS sections in the
README file.

This program is released under the terms of the license contained
in the file LICENSE.
------------------------------------------------------------------ */

/* #include "bzlib_private.h" */
27
/*
 * Exchange the contents of two uint32_t lvalues.
 *
 * The temporary is unsigned because every operand passed in this file
 * is an element of a uint32_t array; a signed temporary would make the
 * round trip implementation-defined for values above INT32_MAX.
 *
 * Each argument is evaluated twice, so it must be free of side effects.
 * The do/while(0) wrapper makes "mswap(a, b);" a single statement that
 * is safe in unbraced if/else bodies.
 */
#define mswap(zz1, zz2) \
do { \
	uint32_t zztmp = (zz1); \
	(zz1) = (zz2); \
	(zz2) = zztmp; \
} while (0)
34
/*
 * Exchange two runs of zzn consecutive elements of ptr[]:
 * ptr[zzp1 .. zzp1+zzn-1] <-> ptr[zzp2 .. zzp2+zzn-1].
 *
 * Elements are exchanged one pair at a time in ascending index order.
 * A zero or negative zzn leaves the array untouched.
 * The temporary is uint32_t so that entries above INT32_MAX are moved
 * without any signed conversion.
 */
static
/* No measurable speed gain with inlining */
/* ALWAYS_INLINE */
void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
{
	for (; zzn > 0; zzn--) {
		uint32_t held = ptr[zzp1];
		ptr[zzp1] = ptr[zzp2];
		ptr[zzp2] = held;
		zzp1++;
		zzp2++;
	}
}
47
48static
49ALWAYS_INLINE
50int32_t mmin(int32_t a, int32_t b)
51{
52 return (a < b) ? a : b;
53}
54
55
/*---------------------------------------------*/
/*--- Fallback O(N log(N)^2) sorting        ---*/
/*--- algorithm, for repetitive blocks      ---*/
/*---------------------------------------------*/

/*---------------------------------------------*/
/*
 * Sort fmap[lo .. hi] (inclusive) into non-decreasing order of the key
 * eclass[fmap[i]] using insertion sort.
 *
 * When the range holds more than four entries a stride-4 insertion
 * pass runs first; a plain stride-1 insertion pass then finishes the
 * job. Only fmap[] is modified; eclass[] is read-only here.
 *
 * fmap entries are uint32_t indices, so the entry being inserted is
 * kept in a uint32_t rather than being squeezed through a signed int.
 */
static
inline
void fallbackSimpleSort(uint32_t* fmap,
		uint32_t* eclass,
		int32_t lo,
		int32_t hi)
{
	int32_t i, j;
	uint32_t tmp;     /* fmap entry currently being inserted */
	uint32_t ec_tmp;  /* its sort key, eclass[tmp] */

	if (lo == hi) return;

	/* Coarse pass: insertion sort over elements 4 apart */
	if (hi - lo > 3) {
		for (i = hi-4; i >= lo; i--) {
			tmp = fmap[i];
			ec_tmp = eclass[tmp];
			for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
				fmap[j-4] = fmap[j];
			fmap[j-4] = tmp;
		}
	}

	/* Final pass: ordinary insertion sort, working from the top down */
	for (i = hi-1; i >= lo; i--) {
		tmp = fmap[i];
		ec_tmp = eclass[tmp];
		for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
			fmap[j-1] = fmap[j];
		fmap[j-1] = tmp;
	}
}
92
93
/*---------------------------------------------*/
/* Push/pop an inclusive [lo, hi] range on the explicit work stack
 * (stackLo[], stackHi[], sp) local to fallbackQSort3().
 */
#define fpush(lz,hz) \
do { \
	stackLo[sp] = (lz); \
	stackHi[sp] = (hz); \
	sp++; \
} while (0)

#define fpop(lz,hz) \
do { \
	sp--; \
	(lz) = stackLo[sp]; \
	(hz) = stackHi[sp]; \
} while (0)

/* Ranges spanning fewer than this many positions go to insertion sort */
#define FALLBACK_QSORT_SMALL_THRESH 10
/* Capacity of the explicit work stack */
#define FALLBACK_QSORT_STACK_SIZE   100

/*
 * Sort fmap[loSt .. hiSt] (inclusive) by the key eclass[fmap[i]].
 *
 * Non-recursive 3-way (less / equal / greater) quicksort driven by an
 * explicit stack. Small ranges are handed to fallbackSimpleSort().
 * Only fmap[] is modified.
 */
static
void fallbackQSort3(uint32_t* fmap,
		uint32_t* eclass,
		int32_t loSt,
		int32_t hiSt)
{
	int32_t unLo, unHi, ltLo, gtHi, n, m;
	int32_t sp, lo, hi;
	uint32_t med, r;
	int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
	int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];

	r = 0;
	sp = 0;
	fpush(loSt, hiSt);

	while (sp > 0) {
		AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);

		fpop(lo, hi);
		if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
			fallbackSimpleSort(fmap, eclass, lo, hi);
			continue;
		}

		/* Random partitioning. Median of 3 sometimes fails to
		 * avoid bad cases. Median of 9 seems to help but
		 * looks rather expensive. This too seems to work but
		 * is cheaper. Guidance for the magic constants
		 * 7621 and 32768 is taken from Sedgewick's algorithms
		 * book, chapter 35.
		 */
		r = ((r * 7621) + 1) % 32768;
		switch (r % 3) {
		case 0:
			med = eclass[fmap[lo]];
			break;
		case 1:
			med = eclass[fmap[(lo+hi)>>1]];
			break;
		default:
			med = eclass[fmap[hi]];
			break;
		}

		unLo = ltLo = lo;
		unHi = gtHi = hi;

		/* Partition: keys equal to med are parked at both ends
		 * ([lo, ltLo) and (gtHi, hi]); smaller keys collect on the
		 * left of the unexamined window, larger ones on the right.
		 */
		for (;;) {
			while (unLo <= unHi) {
				n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
				if (n > 0)
					break;
				if (n == 0) {
					mswap(fmap[unLo], fmap[ltLo]);
					ltLo++;
				}
				unLo++;
			}
			while (unLo <= unHi) {
				n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
				if (n < 0)
					break;
				if (n == 0) {
					mswap(fmap[unHi], fmap[gtHi]);
					gtHi--;
				}
				unHi--;
			}
			if (unLo > unHi)
				break;
			mswap(fmap[unLo], fmap[unHi]);
			unLo++;
			unHi--;
		}

		AssertD(unHi == unLo-1, "fallbackQSort3(2)");

		/* Every key in the range equalled med: nothing left to order */
		if (gtHi < ltLo)
			continue;

		/* Move the parked "equal" runs from the ends into the middle */
		n = mmin(ltLo-lo, unLo-ltLo);
		mvswap(fmap, lo, unLo-n, n);
		m = mmin(hi-gtHi, gtHi-unHi);
		mvswap(fmap, unLo, hi-m+1, m);

		/* [lo, n] now holds the smaller keys, [m, hi] the larger ones */
		n = lo + unLo - ltLo - 1;
		m = hi - (gtHi - unHi) + 1;

		/* Push the larger side first so the smaller one is popped next */
		if (n - lo > hi - m) {
			fpush(lo, n);
			fpush(m, hi);
		} else {
			fpush(m, hi);
			fpush(lo, n);
		}
	}
}

#undef fpush
#undef fpop
#undef FALLBACK_QSORT_SMALL_THRESH
#undef FALLBACK_QSORT_STACK_SIZE
207
208
/*---------------------------------------------*/
/* Pre:
 *	nblock > 0
 *	eclass exists for [0 .. nblock-1]
 *	((uint8_t*)eclass) [0 .. nblock-1] holds block
 *	fmap exists for [0 .. nblock-1]
 *
 * Post:
 *	((uint8_t*)eclass) [0 .. nblock-1] holds block
 *	All other areas of eclass destroyed
 *	fmap [0 .. nblock-1] holds sorted order
 *	bhtab[0 .. 2+(nblock/32)] destroyed
 */

/*
 * Bucket-header bitmap helpers: bit zz of bhtab[] is set when position
 * zz of fmap[] starts a new bucket. The shifted constant is unsigned
 * so that selecting bit 31 does not shift into the sign bit of an int
 * (which is undefined behaviour).
 */
#define SET_BH(zz)       bhtab[(zz) >> 5] |= ((uint32_t)1 << ((zz) & 31))
#define CLEAR_BH(zz)     bhtab[(zz) >> 5] &= ~((uint32_t)1 << ((zz) & 31))
#define ISSET_BH(zz)     (bhtab[(zz) >> 5] & ((uint32_t)1 << ((zz) & 31)))
#define WORD_BH(zz)      bhtab[(zz) >> 5]
#define UNALIGNED_BH(zz) ((zz) & 0x01f)

static
void fallbackSort(uint32_t* fmap,
		uint32_t* eclass,
		uint32_t* bhtab,
		int32_t nblock)
{
	int32_t ftab[257];
	int32_t ftabCopy[256];
	int32_t H, i, j, k, l, r, cc, cc1;
	int32_t nNotDone;
	int32_t nBhtab;
	uint8_t* eclass8 = (uint8_t*)eclass;

	/*
	 * Initial 1-char radix sort to generate
	 * initial fmap and initial BH bits.
	 */
	for (i = 0; i < 257; i++) ftab[i] = 0;
	for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
	/* Keep the raw byte counts: they are used to rebuild the block at the end */
	for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];

	/* Turn the counts into running totals */
	j = ftab[0]; /* bbox: optimized */
	for (i = 1; i < 257; i++) {
		j += ftab[i];
		ftab[i] = j;
	}

	/* Fill each byte's bucket from its top downwards; afterwards
	 * ftab[c] is the first fmap position of byte value c.
	 */
	for (i = 0; i < nblock; i++) {
		j = eclass8[i];
		k = ftab[j] - 1;
		ftab[j] = k;
		fmap[k] = i;
	}

	nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
	for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
	for (i = 0; i < 256; i++) SET_BH(ftab[i]);

	/*
	 * Inductively refine the buckets. Kind-of an
	 * "exponential radix sort" (!), inspired by the
	 * Manber-Myers suffix array construction algorithm.
	 */

	/*-- set sentinel bits for block-end detection --*/
	for (i = 0; i < 32; i++) {
		SET_BH(nblock + 2*i);
		CLEAR_BH(nblock + 2*i + 1);
	}

	/*-- the log(N) loop --*/
	H = 1;
	while (1) {
		/* Label position fmap[i]-H (mod nblock) with the start
		 * index of the bucket that currently contains i.
		 */
		j = 0;
		for (i = 0; i < nblock; i++) {
			if (ISSET_BH(i))
				j = i;
			k = fmap[i] - H;
			if (k < 0)
				k += nblock;
			eclass[k] = j;
		}

		nNotDone = 0;
		r = -1;
		while (1) {

			/*-- find the next non-singleton bucket --*/
			/* Skip a run of set bits: bitwise up to a word
			 * boundary, then whole words, then bitwise again.
			 */
			k = r + 1;
			while (ISSET_BH(k) && UNALIGNED_BH(k))
				k++;
			if (ISSET_BH(k)) {
				while (WORD_BH(k) == 0xffffffff) k += 32;
				while (ISSET_BH(k)) k++;
			}
			l = k - 1;
			if (l >= nblock)
				break;
			/* Same scan for the following run of clear bits */
			while (!ISSET_BH(k) && UNALIGNED_BH(k))
				k++;
			if (!ISSET_BH(k)) {
				while (WORD_BH(k) == 0x00000000) k += 32;
				while (!ISSET_BH(k)) k++;
			}
			r = k - 1;
			if (r >= nblock)
				break;

			/*-- now [l, r] bracket current bucket --*/
			if (r > l) {
				nNotDone += (r - l + 1);
				fallbackQSort3(fmap, eclass, l, r);

				/*-- scan bucket and generate header bits-- */
				cc = -1;
				for (i = l; i <= r; i++) {
					cc1 = eclass[fmap[i]];
					if (cc != cc1) {
						SET_BH(i);
						cc = cc1;
					}
				}
			}
		}

		H *= 2;
		if (H > nblock || nNotDone == 0)
			break;
	}

	/*
	 * Reconstruct the original block in
	 * eclass8 [0 .. nblock-1], since the
	 * previous phase destroyed it.
	 */
	j = 0;
	for (i = 0; i < nblock; i++) {
		while (ftabCopy[j] == 0)
			j++;
		ftabCopy[j]--;
		eclass8[fmap[i]] = (uint8_t)j;
	}
	AssertH(j < 256, 1005);
}

#undef SET_BH
#undef CLEAR_BH
#undef ISSET_BH
#undef WORD_BH
#undef UNALIGNED_BH
359
360
/*---------------------------------------------*/
/*--- The main, O(N^2 log(N)) sorting       ---*/
/*--- algorithm. Faster for "normal"        ---*/
/*--- non-repetitive blocks.                ---*/
/*---------------------------------------------*/

/*---------------------------------------------*/
368static
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000369NOINLINE
370int mainGtU(
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000371 uint32_t i1,
372 uint32_t i2,
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000373 uint8_t* block,
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000374 uint16_t* quadrant,
375 uint32_t nblock,
376 int32_t* budget)
377{
378 int32_t k;
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000379 uint8_t c1, c2;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000380 uint16_t s1, s2;
381
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000382/* Loop unrolling here is actually very useful
383 * (generated code is much simpler),
384 * code size increase is only 270 bytes (i386)
385 * but speeds up compression 10% overall
386 */
387
388#if CONFIG_BZIP2_FEATURE_SPEED >= 1
389
390#define TIMES_8(code) \
391 code; code; code; code; \
392 code; code; code; code;
393#define TIMES_12(code) \
394 code; code; code; code; \
395 code; code; code; code; \
396 code; code; code; code;
397
398#else
399
400#define TIMES_8(code) \
401{ \
402 int nn = 8; \
403 do { \
404 code; \
405 } while (--nn); \
406}
407#define TIMES_12(code) \
408{ \
409 int nn = 12; \
410 do { \
411 code; \
412 } while (--nn); \
413}
414
415#endif
416
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000417 AssertD(i1 != i2, "mainGtU");
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000418 TIMES_12(
419 c1 = block[i1]; c2 = block[i2];
420 if (c1 != c2) return (c1 > c2);
421 i1++; i2++;
422 )
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000423
424 k = nblock + 8;
425
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000426 do {
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000427 TIMES_8(
428 c1 = block[i1]; c2 = block[i2];
429 if (c1 != c2) return (c1 > c2);
430 s1 = quadrant[i1]; s2 = quadrant[i2];
431 if (s1 != s2) return (s1 > s2);
432 i1++; i2++;
433 )
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000434
435 if (i1 >= nblock) i1 -= nblock;
436 if (i2 >= nblock) i2 -= nblock;
437
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000438 (*budget)--;
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000439 k -= 8;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000440 } while (k >= 0);
441
442 return False;
443}
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000444#undef TIMES_8
445#undef TIMES_12
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000446
/*---------------------------------------------*/
/*
 * Knuth's increments seem to work better
 * than Incerpi-Sedgewick here. Possibly
 * because the number of elems to sort is
 * usually small, typically <= 20.
 */
static
const int32_t incs[14] = {
	1, 4, 13, 40, 121, 364, 1093, 3280,
	9841, 29524, 88573, 265720,
	797161, 2391484
};

/*
 * Shell sort of ptr[lo .. hi] (inclusive) using the gaps in incs[].
 * Two entries a and b are ordered by mainGtU(a+d, b+d, ...), i.e. by
 * the block contents starting d positions into each entry.
 *
 * Returns early, leaving the range only partly sorted, once *budget
 * has gone negative; the caller is expected to check *budget.
 */
static
void mainSimpleSort(uint32_t* ptr,
		uint8_t* block,
		uint16_t* quadrant,
		int32_t nblock,
		int32_t lo,
		int32_t hi,
		int32_t d,
		int32_t* budget)
{
	int32_t pos, slot, gap, count, incIdx;
	uint32_t held;

	count = hi - lo + 1;
	if (count < 2) return;

	/* Start from the largest increment that is below the element count */
	for (incIdx = 0; incs[incIdx] < count; incIdx++)
		continue;
	incIdx--;

	for (; incIdx >= 0; incIdx--) {
		gap = incs[incIdx];

		pos = lo + gap;
		for (;;) {
			/*-- copy 1 --*/
			if (pos > hi) break;
			held = ptr[pos];
			slot = pos;
			while (mainGtU(ptr[slot-gap]+d, held+d, block, quadrant, nblock, budget)) {
				ptr[slot] = ptr[slot-gap];
				slot -= gap;
				if (slot < lo + gap) break;
			}
			ptr[slot] = held;
			pos++;

/* 1.5% overall speedup, +290 bytes */
#if CONFIG_BZIP2_FEATURE_SPEED >= 3
			/*-- copy 2 --*/
			if (pos > hi) break;
			held = ptr[pos];
			slot = pos;
			while (mainGtU(ptr[slot-gap]+d, held+d, block, quadrant, nblock, budget)) {
				ptr[slot] = ptr[slot-gap];
				slot -= gap;
				if (slot < lo + gap) break;
			}
			ptr[slot] = held;
			pos++;

			/*-- copy 3 --*/
			if (pos > hi) break;
			held = ptr[pos];
			slot = pos;
			while (mainGtU(ptr[slot-gap]+d, held+d, block, quadrant, nblock, budget)) {
				ptr[slot] = ptr[slot-gap];
				slot -= gap;
				if (slot < lo + gap) break;
			}
			ptr[slot] = held;
			pos++;
#endif
			/* Give up as soon as the comparison budget is spent */
			if (*budget < 0) return;
		}
	}
}
528
529
/*---------------------------------------------*/
/*
 * The following is an implementation of
 * an elegant 3-way quicksort for strings,
 * described in a paper "Fast Algorithms for
 * Sorting and Searching Strings", by Robert
 * Sedgewick and Jon L. Bentley.
 */
538
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000539static
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000540ALWAYS_INLINE
541uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000542{
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000543 uint8_t t;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000544 if (a > b) {
545 t = a;
546 a = b;
547 b = t;
548 };
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000549 /* here b >= a */
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000550 if (b > c) {
551 b = c;
552 if (a > b)
553 b = a;
554 }
555 return b;
556}
557
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000558#define mpush(lz,hz,dz) \
559{ \
560 stackLo[sp] = lz; \
561 stackHi[sp] = hz; \
562 stackD [sp] = dz; \
563 sp++; \
564}
565
566#define mpop(lz,hz,dz) \
567{ \
568 sp--; \
569 lz = stackLo[sp]; \
570 hz = stackHi[sp]; \
571 dz = stackD [sp]; \
572}
573
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000574#define mnextsize(az) (nextHi[az] - nextLo[az])
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000575
576#define mnextswap(az,bz) \
577{ \
578 int32_t tz; \
579 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
580 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
581 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
582}
583
584#define MAIN_QSORT_SMALL_THRESH 20
585#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
586#define MAIN_QSORT_STACK_SIZE 100
587
588static
589void mainQSort3(uint32_t* ptr,
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000590 uint8_t* block,
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000591 uint16_t* quadrant,
592 int32_t nblock,
593 int32_t loSt,
594 int32_t hiSt,
595 int32_t dSt,
596 int32_t* budget)
597{
598 int32_t unLo, unHi, ltLo, gtHi, n, m, med;
599 int32_t sp, lo, hi, d;
600
601 int32_t stackLo[MAIN_QSORT_STACK_SIZE];
602 int32_t stackHi[MAIN_QSORT_STACK_SIZE];
603 int32_t stackD [MAIN_QSORT_STACK_SIZE];
604
605 int32_t nextLo[3];
606 int32_t nextHi[3];
607 int32_t nextD [3];
608
609 sp = 0;
610 mpush(loSt, hiSt, dSt);
611
612 while (sp > 0) {
613 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
614
615 mpop(lo, hi, d);
616 if (hi - lo < MAIN_QSORT_SMALL_THRESH
617 || d > MAIN_QSORT_DEPTH_THRESH
618 ) {
619 mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
620 if (*budget < 0)
621 return;
622 continue;
623 }
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000624 med = (int32_t) mmed3(block[ptr[lo ] + d],
625 block[ptr[hi ] + d],
626 block[ptr[(lo+hi) >> 1] + d]);
627
628 unLo = ltLo = lo;
629 unHi = gtHi = hi;
630
631 while (1) {
632 while (1) {
633 if (unLo > unHi)
634 break;
635 n = ((int32_t)block[ptr[unLo]+d]) - med;
636 if (n == 0) {
637 mswap(ptr[unLo], ptr[ltLo]);
638 ltLo++;
639 unLo++;
640 continue;
641 };
642 if (n > 0) break;
643 unLo++;
644 }
645 while (1) {
646 if (unLo > unHi)
647 break;
648 n = ((int32_t)block[ptr[unHi]+d]) - med;
649 if (n == 0) {
650 mswap(ptr[unHi], ptr[gtHi]);
651 gtHi--;
652 unHi--;
653 continue;
654 };
655 if (n < 0) break;
656 unHi--;
657 }
658 if (unLo > unHi)
659 break;
660 mswap(ptr[unLo], ptr[unHi]);
661 unLo++;
662 unHi--;
663 }
664
665 AssertD(unHi == unLo-1, "mainQSort3(2)");
666
667 if (gtHi < ltLo) {
668 mpush(lo, hi, d + 1);
669 continue;
670 }
671
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000672 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
673 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000674
675 n = lo + unLo - ltLo - 1;
676 m = hi - (gtHi - unHi) + 1;
677
678 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
679 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
680 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
681
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000682 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
683 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
684 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000685
686 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
687 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
688
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000689 mpush(nextLo[0], nextHi[0], nextD[0]);
690 mpush(nextLo[1], nextHi[1], nextD[1]);
691 mpush(nextLo[2], nextHi[2], nextD[2]);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000692 }
693}
694
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000695#undef mpush
696#undef mpop
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000697#undef mnextsize
698#undef mnextswap
699#undef MAIN_QSORT_SMALL_THRESH
700#undef MAIN_QSORT_DEPTH_THRESH
701#undef MAIN_QSORT_STACK_SIZE
702
703
704/*---------------------------------------------*/
705/* Pre:
Denis Vlasenko6cee58e2007-11-04 15:43:26 +0000706 * nblock > N_OVERSHOOT
707 * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
708 * ((uint8_t*)block32) [0 .. nblock-1] holds block
709 * ptr exists for [0 .. nblock-1]
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000710 *
711 * Post:
Denis Vlasenko6cee58e2007-11-04 15:43:26 +0000712 * ((uint8_t*)block32) [0 .. nblock-1] holds block
713 * All other areas of block32 destroyed
714 * ftab[0 .. 65536] destroyed
715 * ptr [0 .. nblock-1] holds sorted order
716 * if (*budget < 0), sorting was abandoned
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000717 */
718
719#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
720#define SETMASK (1 << 21)
721#define CLEARMASK (~(SETMASK))
722
723static NOINLINE
724void mainSort(uint32_t* ptr,
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000725 uint8_t* block,
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000726 uint16_t* quadrant,
727 uint32_t* ftab,
728 int32_t nblock,
729 int32_t* budget)
730{
731 int32_t i, j, k, ss, sb;
732 int32_t runningOrder[256];
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000733 Bool bigDone[256];
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000734 int32_t copyStart[256];
735 int32_t copyEnd [256];
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000736 uint8_t c1;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000737 int32_t numQSorted;
738 uint16_t s;
739
740 /*-- set up the 2-byte frequency table --*/
741 /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
742 memset(ftab, 0, 65537 * sizeof(ftab[0]));
743
744 j = block[0] << 8;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000745 i = nblock - 1;
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000746/* 3%, +300 bytes */
747#if CONFIG_BZIP2_FEATURE_SPEED >= 2
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000748 for (; i >= 3; i -= 4) {
749 quadrant[i] = 0;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000750 j = (j >> 8) | (((uint16_t)block[i]) << 8);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000751 ftab[j]++;
752 quadrant[i-1] = 0;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000753 j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000754 ftab[j]++;
755 quadrant[i-2] = 0;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000756 j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000757 ftab[j]++;
758 quadrant[i-3] = 0;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000759 j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000760 ftab[j]++;
761 }
762#endif
763 for (; i >= 0; i--) {
764 quadrant[i] = 0;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000765 j = (j >> 8) | (((uint16_t)block[i]) << 8);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000766 ftab[j]++;
767 }
768
769 /*-- (emphasises close relationship of block & quadrant) --*/
770 for (i = 0; i < BZ_N_OVERSHOOT; i++) {
771 block [nblock+i] = block[i];
772 quadrant[nblock+i] = 0;
773 }
774
775 /*-- Complete the initial radix sort --*/
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000776 j = ftab[0]; /* bbox: optimized */
777 for (i = 1; i <= 65536; i++) {
778 j += ftab[i];
779 ftab[i] = j;
780 }
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000781
782 s = block[0] << 8;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000783 i = nblock - 1;
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000784#if CONFIG_BZIP2_FEATURE_SPEED >= 2
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000785 for (; i >= 3; i -= 4) {
786 s = (s >> 8) | (block[i] << 8);
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000787 j = ftab[s] - 1;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000788 ftab[s] = j;
789 ptr[j] = i;
790 s = (s >> 8) | (block[i-1] << 8);
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000791 j = ftab[s] - 1;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000792 ftab[s] = j;
793 ptr[j] = i-1;
794 s = (s >> 8) | (block[i-2] << 8);
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000795 j = ftab[s] - 1;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000796 ftab[s] = j;
797 ptr[j] = i-2;
798 s = (s >> 8) | (block[i-3] << 8);
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000799 j = ftab[s] - 1;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000800 ftab[s] = j;
801 ptr[j] = i-3;
802 }
803#endif
804 for (; i >= 0; i--) {
805 s = (s >> 8) | (block[i] << 8);
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000806 j = ftab[s] - 1;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000807 ftab[s] = j;
808 ptr[j] = i;
809 }
810
811 /*
812 * Now ftab contains the first loc of every small bucket.
813 * Calculate the running order, from smallest to largest
814 * big bucket.
815 */
816 for (i = 0; i <= 255; i++) {
817 bigDone [i] = False;
818 runningOrder[i] = i;
819 }
820
821 {
822 int32_t vv;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000823 /* bbox: was: int32_t h = 1; */
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000824 /* do h = 3 * h + 1; while (h <= 256); */
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000825 uint32_t h = 364;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000826
827 do {
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000828 /*h = h / 3;*/
829 h = (h * 171) >> 9; /* bbox: fast h/3 */
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000830 for (i = h; i <= 255; i++) {
831 vv = runningOrder[i];
832 j = i;
833 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
834 runningOrder[j] = runningOrder[j-h];
835 j = j - h;
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000836 if (j <= (h - 1))
837 goto zero;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000838 }
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000839 zero:
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000840 runningOrder[j] = vv;
841 }
842 } while (h != 1);
843 }
844
845 /*
846 * The main sorting loop.
847 */
848
849 numQSorted = 0;
850
851 for (i = 0; i <= 255; i++) {
852
853 /*
854 * Process big buckets, starting with the least full.
855 * Basically this is a 3-step process in which we call
856 * mainQSort3 to sort the small buckets [ss, j], but
857 * also make a big effort to avoid the calls if we can.
858 */
859 ss = runningOrder[i];
860
861 /*
862 * Step 1:
863 * Complete the big bucket [ss] by quicksorting
864 * any unsorted small buckets [ss, j], for j != ss.
865 * Hopefully previous pointer-scanning phases have already
866 * completed many of the small buckets [ss, j], so
867 * we don't have to sort them at all.
868 */
869 for (j = 0; j <= 255; j++) {
870 if (j != ss) {
871 sb = (ss << 8) + j;
872 if (!(ftab[sb] & SETMASK)) {
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000873 int32_t lo = ftab[sb] & CLEARMASK;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000874 int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
875 if (hi > lo) {
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000876 mainQSort3(
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000877 ptr, block, quadrant, nblock,
878 lo, hi, BZ_N_RADIX, budget
879 );
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000880 if (*budget < 0) return;
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000881 numQSorted += (hi - lo + 1);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000882 }
883 }
884 ftab[sb] |= SETMASK;
885 }
886 }
887
888 AssertH(!bigDone[ss], 1006);
889
890 /*
891 * Step 2:
892 * Now scan this big bucket [ss] so as to synthesise the
893 * sorted order for small buckets [t, ss] for all t,
894 * including, magically, the bucket [ss,ss] too.
895 * This will avoid doing Real Work in subsequent Step 1's.
896 */
897 {
898 for (j = 0; j <= 255; j++) {
899 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
900 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
901 }
902 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
903 k = ptr[j] - 1;
904 if (k < 0)
905 k += nblock;
906 c1 = block[k];
907 if (!bigDone[c1])
908 ptr[copyStart[c1]++] = k;
909 }
910 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
911 k = ptr[j]-1;
912 if (k < 0)
913 k += nblock;
914 c1 = block[k];
915 if (!bigDone[c1])
916 ptr[copyEnd[c1]--] = k;
917 }
918 }
919
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000920 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
921 * Necessity for this case is demonstrated by compressing
922 * a sequence of approximately 48.5 million of character
923 * 251; 1.0.0/1.0.1 will then die here. */
924 AssertH((copyStart[ss]-1 == copyEnd[ss]) \
925 || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000926
927 for (j = 0; j <= 255; j++)
928 ftab[(j << 8) + ss] |= SETMASK;
929
930 /*
931 * Step 3:
932 * The [ss] big bucket is now done. Record this fact,
933 * and update the quadrant descriptors. Remember to
934 * update quadrants in the overshoot area too, if
935 * necessary. The "if (i < 255)" test merely skips
936 * this updating for the last bucket processed, since
937 * updating for the last bucket is pointless.
938 *
939 * The quadrant array provides a way to incrementally
940 * cache sort orderings, as they appear, so as to
941 * make subsequent comparisons in fullGtU() complete
942 * faster. For repetitive blocks this makes a big
943 * difference (but not big enough to be able to avoid
944 * the fallback sorting mechanism, exponential radix sort).
945 *
946 * The precise meaning is: at all times:
947 *
948 * for 0 <= i < nblock and 0 <= j <= nblock
949 *
950 * if block[i] != block[j],
951 *
952 * then the relative values of quadrant[i] and
953 * quadrant[j] are meaningless.
954 *
955 * else {
956 * if quadrant[i] < quadrant[j]
957 * then the string starting at i lexicographically
958 * precedes the string starting at j
959 *
960 * else if quadrant[i] > quadrant[j]
961 * then the string starting at j lexicographically
962 * precedes the string starting at i
963 *
964 * else
965 * the relative ordering of the strings starting
966 * at i and j has not yet been determined.
967 * }
968 */
969 bigDone[ss] = True;
970
971 if (i < 255) {
972 int32_t bbStart = ftab[ss << 8] & CLEARMASK;
973 int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
974 int32_t shifts = 0;
975
976 while ((bbSize >> shifts) > 65534) shifts++;
977
978 for (j = bbSize-1; j >= 0; j--) {
Denis Vlasenko6a9154b2007-10-14 07:49:48 +0000979 int32_t a2update = ptr[bbStart + j];
980 uint16_t qVal = (uint16_t)(j >> shifts);
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000981 quadrant[a2update] = qVal;
982 if (a2update < BZ_N_OVERSHOOT)
983 quadrant[a2update + nblock] = qVal;
984 }
985 AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
986 }
Denis Vlasenko77f1ec12007-10-13 03:36:03 +0000987 }
988}
989
990#undef BIGFREQ
991#undef SETMASK
992#undef CLEARMASK
993
994
995/*---------------------------------------------*/
996/* Pre:
997 * nblock > 0
998 * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +0000999 * ((uint8_t*)arr2)[0 .. nblock-1] holds block
Denis Vlasenko77f1ec12007-10-13 03:36:03 +00001000 * arr1 exists for [0 .. nblock-1]
1001 *
1002 * Post:
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +00001003 * ((uint8_t*)arr2) [0 .. nblock-1] holds block
Denis Vlasenko77f1ec12007-10-13 03:36:03 +00001004 * All other areas of block destroyed
1005 * ftab[0 .. 65536] destroyed
1006 * arr1[0 .. nblock-1] holds sorted order
1007 */
1008static NOINLINE
1009void BZ2_blockSort(EState* s)
1010{
1011 /* In original bzip2 1.0.4, it's a parameter, but 30
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +00001012 * (which was the default) should work ok. */
Denis Vlasenko77f1ec12007-10-13 03:36:03 +00001013 enum { wfact = 30 };
1014
1015 uint32_t* ptr = s->ptr;
Denis Vlasenkoef3aabe2007-10-14 00:43:01 +00001016 uint8_t* block = s->block;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +00001017 uint32_t* ftab = s->ftab;
1018 int32_t nblock = s->nblock;
1019 uint16_t* quadrant;
1020 int32_t budget;
1021 int32_t i;
1022
1023 if (nblock < 10000) {
1024 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1025 } else {
1026 /* Calculate the location for quadrant, remembering to get
1027 * the alignment right. Assumes that &(block[0]) is at least
1028 * 2-byte aligned -- this should be ok since block is really
1029 * the first section of arr2.
1030 */
1031 i = nblock + BZ_N_OVERSHOOT;
1032 if (i & 1) i++;
1033 quadrant = (uint16_t*)(&(block[i]));
1034
1035 /* (wfact-1) / 3 puts the default-factor-30
1036 * transition point at very roughly the same place as
1037 * with v0.1 and v0.9.0.
1038 * Not that it particularly matters any more, since the
1039 * resulting compressed stream is now the same regardless
1040 * of whether or not we use the main sort or fallback sort.
1041 */
1042 budget = nblock * ((wfact-1) / 3);
1043
1044 mainSort(ptr, block, quadrant, ftab, nblock, &budget);
1045 if (budget < 0) {
1046 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1047 }
1048 }
1049
1050 s->origPtr = -1;
1051 for (i = 0; i < s->nblock; i++)
1052 if (ptr[i] == 0) {
Denis Vlasenko6a9154b2007-10-14 07:49:48 +00001053 s->origPtr = i;
1054 break;
Denis Vlasenko77f1ec12007-10-13 03:36:03 +00001055 };
1056
1057 AssertH(s->origPtr != -1, 1003);
1058}
1059
1060
1061/*-------------------------------------------------------------*/
1062/*--- end blocksort.c ---*/
1063/*-------------------------------------------------------------*/