1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.commons.compress.compressors.bzip2;
20
21 import java.util.BitSet;
22
23 /**
24 * Encapsulates the Burrows-Wheeler sorting algorithm needed by {@link
25 * BZip2CompressorOutputStream}.
26 *
27 * <p>This class is based on a Java port of Julian Seward's
28 * blocksort.c in his libbzip2</p>
29 *
30 * <p>The Burrows-Wheeler transform is a reversible transform of the
31 * original data that is supposed to group similiar bytes close to
32 * each other. The idea is to sort all permutations of the input and
33 * only keep the last byte of each permutation. E.g. for "Commons
34 * Compress" you'd get:</p>
35 *
36 * <pre>
37 * CompressCommons
38 * Commons Compress
39 * CompressCommons
40 * essCommons Compr
41 * mmons CompressCo
42 * mons CompressCom
43 * mpressCommons Co
44 * ns CompressCommo
45 * ommons CompressC
46 * ompressCommons C
47 * ons CompressComm
48 * pressCommons Com
49 * ressCommons Comp
50 * s CompressCommon
51 * sCommons Compres
52 * ssCommons Compre
53 * </pre>
54 *
55 * <p>Which results in a new text "ss romooCCmmpnse", in adition the
56 * index of the first line that contained the original text is kept -
57 * in this case it is 1. The idea is that in a long English text all
58 * permutations that start with "he" are likely suffixes of a "the" and
59 * thus they end in "t" leading to a larger block of "t"s that can
60 * better be compressed by the subsequent Move-to-Front, run-length
61 * und Huffman encoding steps.</p>
62 *
63 * <p>For more information see for example:</p>
64 * <ul>
65 * <li><a
66 * href="http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf">Burrows,
67 * M. and Wheeler, D.: A Block-sorting Lossless Data Compression
68 * Algorithm</a></li>
69 * <li><a href="http://webglimpse.net/pubs/suffix.pdf">Manber, U. and
70 * Myers, G.: Suffix arrays: A new method for on-line string
71 * searches</a></li>
72 * <li><a
73 * href="http://www.cs.tufts.edu/~nr/comp150fp/archive/bob-sedgewick/fast-strings.pdf">Bentley,
74 * J.L. and Sedgewick, R.: Fast Algorithms for Sorting and Searching
75 * Strings</a></li>
76 * </ul>
77 *
78 * @NotThreadSafe
79 */
80 class BlockSort {
81
82 /*
83 * Some of the constructs used in the C code cannot be ported
84 * literally to Java - for example macros, unsigned types. Some
85 * code has been hand-tuned to improve performance. In order to
86 * avoid memory pressure some structures are reused for several
87 * blocks and some memory is even shared between sorting and the
88 * MTF stage even though either algorithm uses it for its own
89 * purpose.
90 *
91 * Comments preserved from the actual C code are prefixed with
92 * "LBZ2:".
93 */
94
95 /*
96 * 2012-05-20 Stefan Bodewig:
97 *
98 * This class seems to mix several revisions of libbzip2's code.
99 * The mainSort function and those used by it look closer to the
100 * 0.9.5 version but show some variations introduced later. At
101 * the same time the logic of Compress 1.4 to randomize the block
102 * on bad input has been dropped after libbzip2 0.9.0 and replaced
103 * by a fallback sorting algorithm.
104 *
105 * I've added the fallbackSort function of 1.0.6 and tried to
106 * integrate it with the existing code without touching too much.
107 * I've also removed the now unused randomization code.
108 */
109
110 /*
111 * LBZ2: If you are ever unlucky/improbable enough to get a stack
112 * overflow whilst sorting, increase the following constant and
113 * try again. In practice I have never seen the stack go above 27
114 * elems, so the following limit seems very generous.
115 */
116 private static final int QSORT_STACK_SIZE = 1000;
117
118 private static final int FALLBACK_QSORT_STACK_SIZE = 100;
119
120 private static final int STACK_SIZE =
121 QSORT_STACK_SIZE < FALLBACK_QSORT_STACK_SIZE
122 ? FALLBACK_QSORT_STACK_SIZE : QSORT_STACK_SIZE;
123
124 /*
125 * Used when sorting. If too many long comparisons happen, we stop sorting,
126 * and use fallbackSort instead.
127 */
128 private int workDone;
129 private int workLimit;
130 private boolean firstAttempt;
131
132 private final int[] stack_ll = new int[STACK_SIZE]; // 4000 byte
133 private final int[] stack_hh = new int[STACK_SIZE]; // 4000 byte
134 private final int[] stack_dd = new int[QSORT_STACK_SIZE]; // 4000 byte
135
136 private final int[] mainSort_runningOrder = new int[256]; // 1024 byte
137 private final int[] mainSort_copy = new int[256]; // 1024 byte
138 private final boolean[] mainSort_bigDone = new boolean[256]; // 256 byte
139
140 private final int[] ftab = new int[65537]; // 262148 byte
141
142 /**
143 * Array instance identical to Data's sfmap, both are used only
144 * temporarily and indepently, so we do not need to allocate
145 * additional memory.
146 */
147 private final char[] quadrant;
148
149 BlockSort(final BZip2CompressorOutputStream.Data data) {
150 this.quadrant = data.sfmap;
151 }
152
153 void blockSort(final BZip2CompressorOutputStream.Data data, final int last) {
154 this.workLimit = WORK_FACTOR * last;
155 this.workDone = 0;
156 this.firstAttempt = true;
157
158 if (last + 1 < 10000) {
159 fallbackSort(data, last);
160 } else {
161 mainSort(data, last);
162
163 if (this.firstAttempt && (this.workDone > this.workLimit)) {
164 fallbackSort(data, last);
165 }
166 }
167
168 final int[] fmap = data.fmap;
169 data.origPtr = -1;
170 for (int i = 0; i <= last; i++) {
171 if (fmap[i] == 0) {
172 data.origPtr = i;
173 break;
174 }
175 }
176
177 // assert (data.origPtr != -1) : data.origPtr;
178 }
179
180 /**
181 * Adapt fallbackSort to the expected interface of the rest of the
182 * code, in particular deal with the fact that block starts at
183 * offset 1 (in libbzip2 1.0.6 it starts at 0).
184 */
185 final void fallbackSort(final BZip2CompressorOutputStream.Data data,
186 final int last) {
187 data.block[0] = data.block[last + 1];
188 fallbackSort(data.fmap, data.block, last + 1);
189 for (int i = 0; i < last + 1; i++) {
190 --data.fmap[i];
191 }
192 for (int i = 0; i < last + 1; i++) {
193 if (data.fmap[i] == -1) {
194 data.fmap[i] = last;
195 break;
196 }
197 }
198 }
199
200 /*---------------------------------------------*/
201
202 /*---------------------------------------------*/
203 /*--- LBZ2: Fallback O(N log(N)^2) sorting ---*/
204 /*--- algorithm, for repetitive blocks ---*/
205 /*---------------------------------------------*/
206
207 /*
208 * This is the fallback sorting algorithm libbzip2 1.0.6 uses for
209 * repetitive or very short inputs.
210 *
211 * The idea is inspired by Manber-Myers string suffix sorting
212 * algorithm. First a bucket sort places each permutation of the
213 * block into a bucket based on its first byte. Permutations are
214 * represented by pointers to their first character kept in
215 * (partially) sorted order inside the array ftab.
216 *
217 * The next step visits all buckets in order and performs a
218 * quicksort on all permutations of the bucket based on the index
219 * of the bucket the second byte of the permutation belongs to,
220 * thereby forming new buckets. When arrived here the
221 * permutations are sorted up to the second character and we have
222 * buckets of permutations that are identical up to two
223 * characters.
224 *
225 * Repeat the step of quicksorting each bucket, now based on the
226 * bucket holding the sequence of the third and forth character
227 * leading to four byte buckets. Repeat this doubling of bucket
228 * sizes until all buckets only contain single permutations or the
229 * bucket size exceeds the block size.
230 *
231 * I.e.
232 *
233 * "abraba" form three buckets for the chars "a", "b", and "r" in
234 * the first step with
235 *
236 * fmap = { 'a:' 5, 3, 0, 'b:' 4, 1, 'r', 2 }
237 *
238 * when looking at the bucket of "a"s the second characters are in
239 * the buckets that start with fmap-index 0 (rolled over), 3 and 3
240 * respectively, forming two new buckets "aa" and "ab", so we get
241 *
242 * fmap = { 'aa:' 5, 'ab:' 3, 0, 'ba:' 4, 'br': 1, 'ra:' 2 }
243 *
244 * since the last bucket only contained a single item it didn't
245 * have to be sorted at all.
246 *
247 * There now is just one bucket with more than one permutation
248 * that remains to be sorted. For the permutation that starts
249 * with index 3 the third and forth char are in bucket 'aa' at
250 * index 0 and for the one starting at block index 0 they are in
251 * bucket 'ra' with sort index 5. The fully sorted order then becomes.
252 *
253 * fmap = { 5, 3, 0, 4, 1, 2 }
254 *
255 */
256
257 /**
258 * @param fmap points to the index of the starting point of a
259 * permutation inside the block of data in the current
260 * partially sorted order
261 * @param eclass points from the index of a character inside the
262 * block to the first index in fmap that contains the
263 * bucket of its suffix that is sorted in this step.
264 * @param lo lower boundary of the fmap-interval to be sorted
265 * @param hi upper boundary of the fmap-interval to be sorted
266 */
267 private void fallbackSimpleSort(int[] fmap,
268 int[] eclass,
269 int lo,
270 int hi) {
271 if (lo == hi) {
272 return;
273 }
274
275 int j;
276 if (hi - lo > 3) {
277 for (int i = hi - 4; i >= lo; i--) {
278 int tmp = fmap[i];
279 int ec_tmp = eclass[tmp];
280 for (j = i + 4; j <= hi && ec_tmp > eclass[fmap[j]];
281 j += 4) {
282 fmap[j - 4] = fmap[j];
283 }
284 fmap[j - 4] = tmp;
285 }
286 }
287
288 for (int i = hi - 1; i >= lo; i--) {
289 int tmp = fmap[i];
290 int ec_tmp = eclass[tmp];
291 for (j = i + 1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) {
292 fmap[j - 1] = fmap[j];
293 }
294 fmap[j-1] = tmp;
295 }
296 }
297
298 private static final int FALLBACK_QSORT_SMALL_THRESH = 10;
299
300 /**
301 * swaps two values in fmap
302 */
303 private void fswap(int[] fmap, int zz1, int zz2) {
304 int zztmp = fmap[zz1];
305 fmap[zz1] = fmap[zz2];
306 fmap[zz2] = zztmp;
307 }
308
309 /**
310 * swaps two intervals starting at yyp1 and yyp2 of length yyn inside fmap.
311 */
312 private void fvswap(int[] fmap, int yyp1, int yyp2, int yyn) {
313 while (yyn > 0) {
314 fswap(fmap, yyp1, yyp2);
315 yyp1++; yyp2++; yyn--;
316 }
317 }
318
319 private int fmin(int a, int b) {
320 return a < b ? a : b;
321 }
322
323 private void fpush(int sp, int lz, int hz) {
324 stack_ll[sp] = lz;
325 stack_hh[sp] = hz;
326 }
327
328 private int[] fpop(int sp) {
329 return new int[] { stack_ll[sp], stack_hh[sp] };
330 }
331
332 /**
333 * @param fmap points to the index of the starting point of a
334 * permutation inside the block of data in the current
335 * partially sorted order
336 * @param eclass points from the index of a character inside the
337 * block to the first index in fmap that contains the
338 * bucket of its suffix that is sorted in this step.
339 * @param loSt lower boundary of the fmap-interval to be sorted
340 * @param hiSt upper boundary of the fmap-interval to be sorted
341 */
342 private void fallbackQSort3(int[] fmap,
343 int[] eclass,
344 int loSt,
345 int hiSt) {
346 int lo, unLo, ltLo, hi, unHi, gtHi, n;
347
348 long r = 0;
349 int sp = 0;
350 fpush(sp++, loSt, hiSt);
351
352 while (sp > 0) {
353 int[] s = fpop(--sp);
354 lo = s[0]; hi = s[1];
355
356 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
357 fallbackSimpleSort(fmap, eclass, lo, hi);
358 continue;
359 }
360
361 /* LBZ2: Random partitioning. Median of 3 sometimes fails to
362 avoid bad cases. Median of 9 seems to help but
363 looks rather expensive. This too seems to work but
364 is cheaper. Guidance for the magic constants
365 7621 and 32768 is taken from Sedgewick's algorithms
366 book, chapter 35.
367 */
368 r = ((r * 7621) + 1) % 32768;
369 long r3 = r % 3, med;
370 if (r3 == 0) {
371 med = eclass[fmap[lo]];
372 } else if (r3 == 1) {
373 med = eclass[fmap[(lo + hi) >>> 1]];
374 } else {
375 med = eclass[fmap[hi]];
376 }
377
378 unLo = ltLo = lo;
379 unHi = gtHi = hi;
380
381 // looks like the ternary partition attributed to Wegner
382 // in the cited Sedgewick paper
383 while (true) {
384 while (true) {
385 if (unLo > unHi) {
386 break;
387 }
388 n = eclass[fmap[unLo]] - (int) med;
389 if (n == 0) {
390 fswap(fmap, unLo, ltLo);
391 ltLo++; unLo++;
392 continue;
393 }
394 if (n > 0) {
395 break;
396 }
397 unLo++;
398 }
399 while (true) {
400 if (unLo > unHi) {
401 break;
402 }
403 n = eclass[fmap[unHi]] - (int) med;
404 if (n == 0) {
405 fswap(fmap, unHi, gtHi);
406 gtHi--; unHi--;
407 continue;
408 }
409 if (n < 0) {
410 break;
411 }
412 unHi--;
413 }
414 if (unLo > unHi) {
415 break;
416 }
417 fswap(fmap, unLo, unHi); unLo++; unHi--;
418 }
419
420 if (gtHi < ltLo) {
421 continue;
422 }
423
424 n = fmin(ltLo - lo, unLo - ltLo);
425 fvswap(fmap, lo, unLo - n, n);
426 int m = fmin(hi - gtHi, gtHi - unHi);
427 fvswap(fmap, unHi + 1, hi - m + 1, m);
428
429 n = lo + unLo - ltLo - 1;
430 m = hi - (gtHi - unHi) + 1;
431
432 if (n - lo > hi - m) {
433 fpush(sp++, lo, n);
434 fpush(sp++, m, hi);
435 } else {
436 fpush(sp++, m, hi);
437 fpush(sp++, lo, n);
438 }
439 }
440 }
441
442
443 /*---------------------------------------------*/
444
445 private int[] eclass;
446
447 private int[] getEclass() {
448 return eclass == null
449 ? (eclass = new int[quadrant.length / 2]) : eclass;
450 }
451
452 /*
453 * The C code uses an array of ints (each int holding 32 flags) to
454 * represents the bucket-start flags (bhtab). It also contains
455 * optimizations to skip over 32 consecutively set or
456 * consecutively unset bits on word boundaries at once. For now
457 * I've chosen to use the simpler but potentially slower code
458 * using BitSet - also in the hope that using the BitSet#nextXXX
459 * methods may be fast enough.
460 */
461
462 /**
463 * @param fmap points to the index of the starting point of a
464 * permutation inside the block of data in the current
465 * partially sorted order
466 * @param block the original data
467 * @param nblock size of the block
468 * @param off offset of first byte to sort in block
469 */
470 final void fallbackSort(int[] fmap, byte[] block, int nblock) {
471 final int[] ftab = new int[257];
472 int H, i, j, k, l, r, cc, cc1;
473 int nNotDone;
474 int nBhtab;
475 final int[] eclass = getEclass();
476
477 for (i = 0; i < nblock; i++) {
478 eclass[i] = 0;
479 }
480 /*--
481 LBZ2: Initial 1-char radix sort to generate
482 initial fmap and initial BH bits.
483 --*/
484 for (i = 0; i < nblock; i++) {
485 ftab[block[i] & 0xff]++;
486 }
487 for (i = 1; i < 257; i++) {
488 ftab[i] += ftab[i - 1];
489 }
490
491 for (i = 0; i < nblock; i++) {
492 j = block[i] & 0xff;
493 k = ftab[j] - 1;
494 ftab[j] = k;
495 fmap[k] = i;
496 }
497
498 nBhtab = 64 + nblock;
499 BitSet bhtab = new BitSet(nBhtab);
500 for (i = 0; i < 256; i++) {
501 bhtab.set(ftab[i]);
502 }
503
504 /*--
505 LBZ2: Inductively refine the buckets. Kind-of an
506 "exponential radix sort" (!), inspired by the
507 Manber-Myers suffix array construction algorithm.
508 --*/
509
510 /*-- LBZ2: set sentinel bits for block-end detection --*/
511 for (i = 0; i < 32; i++) {
512 bhtab.set(nblock + 2 * i);
513 bhtab.clear(nblock + 2 * i + 1);
514 }
515
516 /*-- LBZ2: the log(N) loop --*/
517 H = 1;
518 while (true) {
519
520 j = 0;
521 for (i = 0; i < nblock; i++) {
522 if (bhtab.get(i)) {
523 j = i;
524 }
525 k = fmap[i] - H;
526 if (k < 0) {
527 k += nblock;
528 }
529 eclass[k] = j;
530 }
531
532 nNotDone = 0;
533 r = -1;
534 while (true) {
535
536 /*-- LBZ2: find the next non-singleton bucket --*/
537 k = r + 1;
538 k = bhtab.nextClearBit(k);
539 l = k - 1;
540 if (l >= nblock) {
541 break;
542 }
543 k = bhtab.nextSetBit(k + 1);
544 r = k - 1;
545 if (r >= nblock) {
546 break;
547 }
548
549 /*-- LBZ2: now [l, r] bracket current bucket --*/
550 if (r > l) {
551 nNotDone += (r - l + 1);
552 fallbackQSort3(fmap, eclass, l, r);
553
554 /*-- LBZ2: scan bucket and generate header bits-- */
555 cc = -1;
556 for (i = l; i <= r; i++) {
557 cc1 = eclass[fmap[i]];
558 if (cc != cc1) {
559 bhtab.set(i);
560 cc = cc1;
561 }
562 }
563 }
564 }
565
566 H *= 2;
567 if (H > nblock || nNotDone == 0) {
568 break;
569 }
570 }
571 }
572
573 /*---------------------------------------------*/
574
575 /*
576 * LBZ2: Knuth's increments seem to work better than Incerpi-Sedgewick here.
577 * Possibly because the number of elems to sort is usually small, typically
578 * <= 20.
579 */
580 private static final int[] INCS = { 1, 4, 13, 40, 121, 364, 1093, 3280,
581 9841, 29524, 88573, 265720, 797161,
582 2391484 };
583
584 /**
585 * This is the most hammered method of this class.
586 *
587 * <p>
588 * This is the version using unrolled loops. Normally I never use such ones
589 * in Java code. The unrolling has shown a noticable performance improvement
590 * on JRE 1.4.2 (Linux i586 / HotSpot Client). Of course it depends on the
591 * JIT compiler of the vm.
592 * </p>
593 */
594 private boolean mainSimpleSort(final BZip2CompressorOutputStream.Data dataShadow,
595 final int lo, final int hi, final int d,
596 final int lastShadow) {
597 final int bigN = hi - lo + 1;
598 if (bigN < 2) {
599 return this.firstAttempt && (this.workDone > this.workLimit);
600 }
601
602 int hp = 0;
603 while (INCS[hp] < bigN) {
604 hp++;
605 }
606
607 final int[] fmap = dataShadow.fmap;
608 final char[] quadrant = this.quadrant;
609 final byte[] block = dataShadow.block;
610 final int lastPlus1 = lastShadow + 1;
611 final boolean firstAttemptShadow = this.firstAttempt;
612 final int workLimitShadow = this.workLimit;
613 int workDoneShadow = this.workDone;
614
615 // Following block contains unrolled code which could be shortened by
616 // coding it in additional loops.
617
618 HP: while (--hp >= 0) {
619 final int h = INCS[hp];
620 final int mj = lo + h - 1;
621
622 for (int i = lo + h; i <= hi;) {
623 // copy
624 for (int k = 3; (i <= hi) && (--k >= 0); i++) {
625 final int v = fmap[i];
626 final int vd = v + d;
627 int j = i;
628
629 // for (int a;
630 // (j > mj) && mainGtU((a = fmap[j - h]) + d, vd,
631 // block, quadrant, lastShadow);
632 // j -= h) {
633 // fmap[j] = a;
634 // }
635 //
636 // unrolled version:
637
638 // start inline mainGTU
639 boolean onceRunned = false;
640 int a = 0;
641
642 HAMMER: while (true) {
643 if (onceRunned) {
644 fmap[j] = a;
645 if ((j -= h) <= mj) {
646 break HAMMER;
647 }
648 } else {
649 onceRunned = true;
650 }
651
652 a = fmap[j - h];
653 int i1 = a + d;
654 int i2 = vd;
655
656 // following could be done in a loop, but
657 // unrolled it for performance:
658 if (block[i1 + 1] == block[i2 + 1]) {
659 if (block[i1 + 2] == block[i2 + 2]) {
660 if (block[i1 + 3] == block[i2 + 3]) {
661 if (block[i1 + 4] == block[i2 + 4]) {
662 if (block[i1 + 5] == block[i2 + 5]) {
663 if (block[(i1 += 6)] == block[(i2 += 6)]) {
664 int x = lastShadow;
665 X: while (x > 0) {
666 x -= 4;
667
668 if (block[i1 + 1] == block[i2 + 1]) {
669 if (quadrant[i1] == quadrant[i2]) {
670 if (block[i1 + 2] == block[i2 + 2]) {
671 if (quadrant[i1 + 1] == quadrant[i2 + 1]) {
672 if (block[i1 + 3] == block[i2 + 3]) {
673 if (quadrant[i1 + 2] == quadrant[i2 + 2]) {
674 if (block[i1 + 4] == block[i2 + 4]) {
675 if (quadrant[i1 + 3] == quadrant[i2 + 3]) {
676 if ((i1 += 4) >= lastPlus1) {
677 i1 -= lastPlus1;
678 }
679 if ((i2 += 4) >= lastPlus1) {
680 i2 -= lastPlus1;
681 }
682 workDoneShadow++;
683 continue X;
684 } else if ((quadrant[i1 + 3] > quadrant[i2 + 3])) {
685 continue HAMMER;
686 } else {
687 break HAMMER;
688 }
689 } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
690 continue HAMMER;
691 } else {
692 break HAMMER;
693 }
694 } else if ((quadrant[i1 + 2] > quadrant[i2 + 2])) {
695 continue HAMMER;
696 } else {
697 break HAMMER;
698 }
699 } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
700 continue HAMMER;
701 } else {
702 break HAMMER;
703 }
704 } else if ((quadrant[i1 + 1] > quadrant[i2 + 1])) {
705 continue HAMMER;
706 } else {
707 break HAMMER;
708 }
709 } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
710 continue HAMMER;
711 } else {
712 break HAMMER;
713 }
714 } else if ((quadrant[i1] > quadrant[i2])) {
715 continue HAMMER;
716 } else {
717 break HAMMER;
718 }
719 } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
720 continue HAMMER;
721 } else {
722 break HAMMER;
723 }
724
725 }
726 break HAMMER;
727 } // while x > 0
728 else {
729 if ((block[i1] & 0xff) > (block[i2] & 0xff)) {
730 continue HAMMER;
731 } else {
732 break HAMMER;
733 }
734 }
735 } else if ((block[i1 + 5] & 0xff) > (block[i2 + 5] & 0xff)) {
736 continue HAMMER;
737 } else {
738 break HAMMER;
739 }
740 } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
741 continue HAMMER;
742 } else {
743 break HAMMER;
744 }
745 } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
746 continue HAMMER;
747 } else {
748 break HAMMER;
749 }
750 } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
751 continue HAMMER;
752 } else {
753 break HAMMER;
754 }
755 } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
756 continue HAMMER;
757 } else {
758 break HAMMER;
759 }
760
761 } // HAMMER
762 // end inline mainGTU
763
764 fmap[j] = v;
765 }
766
767 if (firstAttemptShadow && (i <= hi)
768 && (workDoneShadow > workLimitShadow)) {
769 break HP;
770 }
771 }
772 }
773
774 this.workDone = workDoneShadow;
775 return firstAttemptShadow && (workDoneShadow > workLimitShadow);
776 }
777
778 /*--
779 LBZ2: The following is an implementation of
780 an elegant 3-way quicksort for strings,
781 described in a paper "Fast Algorithms for
782 Sorting and Searching Strings", by Robert
783 Sedgewick and Jon L. Bentley.
784 --*/
785
786 private static void vswap(int[] fmap, int p1, int p2, int n) {
787 n += p1;
788 while (p1 < n) {
789 int t = fmap[p1];
790 fmap[p1++] = fmap[p2];
791 fmap[p2++] = t;
792 }
793 }
794
795 private static byte med3(byte a, byte b, byte c) {
796 return (a < b) ? (b < c ? b : a < c ? c : a) : (b > c ? b : a > c ? c
797 : a);
798 }
799
800 private static final int SMALL_THRESH = 20;
801 private static final int DEPTH_THRESH = 10;
802 private static final int WORK_FACTOR = 30;
803
804 /**
805 * Method "mainQSort3", file "blocksort.c", BZip2 1.0.2
806 */
807 private void mainQSort3(final BZip2CompressorOutputStream.Data dataShadow,
808 final int loSt, final int hiSt, final int dSt,
809 final int last) {
810 final int[] stack_ll = this.stack_ll;
811 final int[] stack_hh = this.stack_hh;
812 final int[] stack_dd = this.stack_dd;
813 final int[] fmap = dataShadow.fmap;
814 final byte[] block = dataShadow.block;
815
816 stack_ll[0] = loSt;
817 stack_hh[0] = hiSt;
818 stack_dd[0] = dSt;
819
820 for (int sp = 1; --sp >= 0;) {
821 final int lo = stack_ll[sp];
822 final int hi = stack_hh[sp];
823 final int d = stack_dd[sp];
824
825 if ((hi - lo < SMALL_THRESH) || (d > DEPTH_THRESH)) {
826 if (mainSimpleSort(dataShadow, lo, hi, d, last)) {
827 return;
828 }
829 } else {
830 final int d1 = d + 1;
831 final int med = med3(block[fmap[lo] + d1],
832 block[fmap[hi] + d1], block[fmap[(lo + hi) >>> 1] + d1]) & 0xff;
833
834 int unLo = lo;
835 int unHi = hi;
836 int ltLo = lo;
837 int gtHi = hi;
838
839 while (true) {
840 while (unLo <= unHi) {
841 final int n = (block[fmap[unLo] + d1] & 0xff)
842 - med;
843 if (n == 0) {
844 final int temp = fmap[unLo];
845 fmap[unLo++] = fmap[ltLo];
846 fmap[ltLo++] = temp;
847 } else if (n < 0) {
848 unLo++;
849 } else {
850 break;
851 }
852 }
853
854 while (unLo <= unHi) {
855 final int n = (block[fmap[unHi] + d1] & 0xff)
856 - med;
857 if (n == 0) {
858 final int temp = fmap[unHi];
859 fmap[unHi--] = fmap[gtHi];
860 fmap[gtHi--] = temp;
861 } else if (n > 0) {
862 unHi--;
863 } else {
864 break;
865 }
866 }
867
868 if (unLo <= unHi) {
869 final int temp = fmap[unLo];
870 fmap[unLo++] = fmap[unHi];
871 fmap[unHi--] = temp;
872 } else {
873 break;
874 }
875 }
876
877 if (gtHi < ltLo) {
878 stack_ll[sp] = lo;
879 stack_hh[sp] = hi;
880 stack_dd[sp] = d1;
881 sp++;
882 } else {
883 int n = ((ltLo - lo) < (unLo - ltLo)) ? (ltLo - lo)
884 : (unLo - ltLo);
885 vswap(fmap, lo, unLo - n, n);
886 int m = ((hi - gtHi) < (gtHi - unHi)) ? (hi - gtHi)
887 : (gtHi - unHi);
888 vswap(fmap, unLo, hi - m + 1, m);
889
890 n = lo + unLo - ltLo - 1;
891 m = hi - (gtHi - unHi) + 1;
892
893 stack_ll[sp] = lo;
894 stack_hh[sp] = n;
895 stack_dd[sp] = d;
896 sp++;
897
898 stack_ll[sp] = n + 1;
899 stack_hh[sp] = m - 1;
900 stack_dd[sp] = d1;
901 sp++;
902
903 stack_ll[sp] = m;
904 stack_hh[sp] = hi;
905 stack_dd[sp] = d;
906 sp++;
907 }
908 }
909 }
910 }
911
912 private static final int SETMASK = (1 << 21);
913 private static final int CLEARMASK = (~SETMASK);
914
915 final void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
916 final int lastShadow) {
917 final int[] runningOrder = this.mainSort_runningOrder;
918 final int[] copy = this.mainSort_copy;
919 final boolean[] bigDone = this.mainSort_bigDone;
920 final int[] ftab = this.ftab;
921 final byte[] block = dataShadow.block;
922 final int[] fmap = dataShadow.fmap;
923 final char[] quadrant = this.quadrant;
924 final int workLimitShadow = this.workLimit;
925 final boolean firstAttemptShadow = this.firstAttempt;
926
927 // LBZ2: Set up the 2-byte frequency table
928 for (int i = 65537; --i >= 0;) {
929 ftab[i] = 0;
930 }
931
932 /*
933 * In the various block-sized structures, live data runs from 0 to
934 * last+NUM_OVERSHOOT_BYTES inclusive. First, set up the overshoot area
935 * for block.
936 */
937 for (int i = 0; i < BZip2Constants.NUM_OVERSHOOT_BYTES; i++) {
938 block[lastShadow + i + 2] = block[(i % (lastShadow + 1)) + 1];
939 }
940 for (int i = lastShadow + BZip2Constants.NUM_OVERSHOOT_BYTES +1; --i >= 0;) {
941 quadrant[i] = 0;
942 }
943 block[0] = block[lastShadow + 1];
944
945 // LBZ2: Complete the initial radix sort:
946
947 int c1 = block[0] & 0xff;
948 for (int i = 0; i <= lastShadow; i++) {
949 final int c2 = block[i + 1] & 0xff;
950 ftab[(c1 << 8) + c2]++;
951 c1 = c2;
952 }
953
954 for (int i = 1; i <= 65536; i++) {
955 ftab[i] += ftab[i - 1];
956 }
957
958 c1 = block[1] & 0xff;
959 for (int i = 0; i < lastShadow; i++) {
960 final int c2 = block[i + 2] & 0xff;
961 fmap[--ftab[(c1 << 8) + c2]] = i;
962 c1 = c2;
963 }
964
965 fmap[--ftab[((block[lastShadow + 1] & 0xff) << 8) + (block[1] & 0xff)]] = lastShadow;
966
967 /*
968 * LBZ2: Now ftab contains the first loc of every small bucket. Calculate the
969 * running order, from smallest to largest big bucket.
970 */
971 for (int i = 256; --i >= 0;) {
972 bigDone[i] = false;
973 runningOrder[i] = i;
974 }
975
976 for (int h = 364; h != 1;) {
977 h /= 3;
978 for (int i = h; i <= 255; i++) {
979 final int vv = runningOrder[i];
980 final int a = ftab[(vv + 1) << 8] - ftab[vv << 8];
981 final int b = h - 1;
982 int j = i;
983 for (int ro = runningOrder[j - h]; (ftab[(ro + 1) << 8] - ftab[ro << 8]) > a; ro = runningOrder[j
984 - h]) {
985 runningOrder[j] = ro;
986 j -= h;
987 if (j <= b) {
988 break;
989 }
990 }
991 runningOrder[j] = vv;
992 }
993 }
994
995 /*
996 * LBZ2: The main sorting loop.
997 */
998 for (int i = 0; i <= 255; i++) {
999 /*
1000 * LBZ2: Process big buckets, starting with the least full.
1001 */
1002 final int ss = runningOrder[i];
1003
1004 // Step 1:
1005 /*
1006 * LBZ2: Complete the big bucket [ss] by quicksorting any unsorted small
1007 * buckets [ss, j]. Hopefully previous pointer-scanning phases have
1008 * already completed many of the small buckets [ss, j], so we don't
1009 * have to sort them at all.
1010 */
1011 for (int j = 0; j <= 255; j++) {
1012 final int sb = (ss << 8) + j;
1013 final int ftab_sb = ftab[sb];
1014 if ((ftab_sb & SETMASK) != SETMASK) {
1015 final int lo = ftab_sb & CLEARMASK;
1016 final int hi = (ftab[sb + 1] & CLEARMASK) - 1;
1017 if (hi > lo) {
1018 mainQSort3(dataShadow, lo, hi, 2, lastShadow);
1019 if (firstAttemptShadow
1020 && (this.workDone > workLimitShadow)) {
1021 return;
1022 }
1023 }
1024 ftab[sb] = ftab_sb | SETMASK;
1025 }
1026 }
1027
1028 // Step 2:
1029 // LBZ2: Now scan this big bucket so as to synthesise the
1030 // sorted order for small buckets [t, ss] for all t != ss.
1031
1032 for (int j = 0; j <= 255; j++) {
1033 copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
1034 }
1035
1036 for (int j = ftab[ss << 8] & CLEARMASK, hj = (ftab[(ss + 1) << 8] & CLEARMASK); j < hj; j++) {
1037 final int fmap_j = fmap[j];
1038 c1 = block[fmap_j] & 0xff;
1039 if (!bigDone[c1]) {
1040 fmap[copy[c1]] = (fmap_j == 0) ? lastShadow : (fmap_j - 1);
1041 copy[c1]++;
1042 }
1043 }
1044
1045 for (int j = 256; --j >= 0;) {
1046 ftab[(j << 8) + ss] |= SETMASK;
1047 }
1048
1049 // Step 3:
1050 /*
1051 * LBZ2: The ss big bucket is now done. Record this fact, and update the
1052 * quadrant descriptors. Remember to update quadrants in the
1053 * overshoot area too, if necessary. The "if (i < 255)" test merely
1054 * skips this updating for the last bucket processed, since updating
1055 * for the last bucket is pointless.
1056 */
1057 bigDone[ss] = true;
1058
1059 if (i < 255) {
1060 final int bbStart = ftab[ss << 8] & CLEARMASK;
1061 final int bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart;
1062 int shifts = 0;
1063
1064 while ((bbSize >> shifts) > 65534) {
1065 shifts++;
1066 }
1067
1068 for (int j = 0; j < bbSize; j++) {
1069 final int a2update = fmap[bbStart + j];
1070 final char qVal = (char) (j >> shifts);
1071 quadrant[a2update] = qVal;
1072 if (a2update < BZip2Constants.NUM_OVERSHOOT_BYTES) {
1073 quadrant[a2update + lastShadow + 1] = qVal;
1074 }
1075 }
1076 }
1077
1078 }
1079 }
1080
1081 }