View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.bzip2;
20  
21  import java.util.BitSet;
22  
23  /**
24   * Encapsulates the Burrows-Wheeler sorting algorithm needed by {@link
25   * BZip2CompressorOutputStream}.
26   *
27   * <p>This class is based on a Java port of Julian Seward's
28   * blocksort.c in his libbzip2</p>
29   *
30   * <p>The Burrows-Wheeler transform is a reversible transform of the
31   * original data that is supposed to group similiar bytes close to
32   * each other.  The idea is to sort all permutations of the input and
33   * only keep the last byte of each permutation.  E.g. for "Commons
34   * Compress" you'd get:</p>
35   *
36   * <pre>
37   *  CompressCommons
38   * Commons Compress
39   * CompressCommons 
40   * essCommons Compr
41   * mmons CompressCo
42   * mons CompressCom
43   * mpressCommons Co
44   * ns CompressCommo
45   * ommons CompressC
46   * ompressCommons C
47   * ons CompressComm
48   * pressCommons Com
49   * ressCommons Comp
50   * s CompressCommon
51   * sCommons Compres
52   * ssCommons Compre
53   * </pre>
54   *
55   * <p>Which results in a new text "ss romooCCmmpnse", in adition the
56   * index of the first line that contained the original text is kept -
57   * in this case it is 1.  The idea is that in a long English text all
58   * permutations that start with "he" are likely suffixes of a "the" and
59   * thus they end in "t" leading to a larger block of "t"s that can
60   * better be compressed by the subsequent Move-to-Front, run-length
61   * und Huffman encoding steps.</p>
62   *
63   * <p>For more information see for example:</p>
64   * <ul>
65   *   <li><a
66   *   href="http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf">Burrows,
67   *   M. and Wheeler, D.: A Block-sorting Lossless Data Compression
68   *   Algorithm</a></li>
69   *   <li><a href="http://webglimpse.net/pubs/suffix.pdf">Manber, U. and
70   *   Myers, G.: Suffix arrays: A new method for on-line string
71   *   searches</a></li>
72   *   <li><a
73   *   href="http://www.cs.tufts.edu/~nr/comp150fp/archive/bob-sedgewick/fast-strings.pdf">Bentley,
74   *   J.L. and Sedgewick, R.: Fast Algorithms for Sorting and Searching
75   *   Strings</a></li>
76   * </ul>
77   *
78   * @NotThreadSafe
79   */
80  class BlockSort {
81  
82      /*
83       * Some of the constructs used in the C code cannot be ported
84       * literally to Java - for example macros, unsigned types.  Some
85       * code has been hand-tuned to improve performance.  In order to
86       * avoid memory pressure some structures are reused for several
87       * blocks and some memory is even shared between sorting and the
88       * MTF stage even though either algorithm uses it for its own
89       * purpose.
90       *
91       * Comments preserved from the actual C code are prefixed with
92       * "LBZ2:".
93       */
94  
95      /*
96       * 2012-05-20 Stefan Bodewig:
97       *
98       * This class seems to mix several revisions of libbzip2's code.
99       * The mainSort function and those used by it look closer to the
100      * 0.9.5 version but show some variations introduced later.  At
101      * the same time the logic of Compress 1.4 to randomize the block
102      * on bad input has been dropped after libbzip2 0.9.0 and replaced
103      * by a fallback sorting algorithm.
104      *
105      * I've added the fallbackSort function of 1.0.6 and tried to
106      * integrate it with the existing code without touching too much.
107      * I've also removed the now unused randomization code.
108      */
109 
110     /*
111      * LBZ2: If you are ever unlucky/improbable enough to get a stack
112      * overflow whilst sorting, increase the following constant and
113      * try again. In practice I have never seen the stack go above 27
114      * elems, so the following limit seems very generous.
115      */
116     private static final int QSORT_STACK_SIZE = 1000;
117 
118     private static final int FALLBACK_QSORT_STACK_SIZE = 100;
119 
120     private static final int STACK_SIZE =
121         QSORT_STACK_SIZE < FALLBACK_QSORT_STACK_SIZE
122         ? FALLBACK_QSORT_STACK_SIZE : QSORT_STACK_SIZE;
123 
124     /*
125      * Used when sorting. If too many long comparisons happen, we stop sorting,
126      * and use fallbackSort instead.
127      */
128     private int workDone;
129     private int workLimit;
130     private boolean firstAttempt;
131 
132     private final int[] stack_ll = new int[STACK_SIZE]; // 4000 byte
133     private final int[] stack_hh = new int[STACK_SIZE]; // 4000 byte
134     private final int[] stack_dd = new int[QSORT_STACK_SIZE]; // 4000 byte
135 
136     private final int[] mainSort_runningOrder = new int[256]; // 1024 byte
137     private final int[] mainSort_copy = new int[256]; // 1024 byte
138     private final boolean[] mainSort_bigDone = new boolean[256]; // 256 byte
139 
140     private final int[] ftab = new int[65537]; // 262148 byte
141 
142     /**
143      * Array instance identical to Data's sfmap, both are used only
144      * temporarily and indepently, so we do not need to allocate
145      * additional memory.
146      */
147     private final char[] quadrant;
148 
149     BlockSort(final BZip2CompressorOutputStream.Data data) {
150         this.quadrant = data.sfmap;
151     }
152 
153     void blockSort(final BZip2CompressorOutputStream.Data data, final int last) {
154         this.workLimit = WORK_FACTOR * last;
155         this.workDone = 0;
156         this.firstAttempt = true;
157 
158         if (last + 1 < 10000) {
159             fallbackSort(data, last);
160         } else {
161             mainSort(data, last);
162 
163             if (this.firstAttempt && (this.workDone > this.workLimit)) {
164                 fallbackSort(data, last);
165             }
166         }
167 
168         final int[] fmap = data.fmap;
169         data.origPtr = -1;
170         for (int i = 0; i <= last; i++) {
171             if (fmap[i] == 0) {
172                 data.origPtr = i;
173                 break;
174             }
175         }
176 
177         // assert (data.origPtr != -1) : data.origPtr;
178     }
179 
180     /**
181      * Adapt fallbackSort to the expected interface of the rest of the
182      * code, in particular deal with the fact that block starts at
183      * offset 1 (in libbzip2 1.0.6 it starts at 0).
184      */
185     final void fallbackSort(final BZip2CompressorOutputStream.Data data,
186                             final int last) {
187         data.block[0] = data.block[last + 1];
188         fallbackSort(data.fmap, data.block, last + 1);
189         for (int i = 0; i < last + 1; i++) {
190             --data.fmap[i];
191         }
192         for (int i = 0; i < last + 1; i++) {
193             if (data.fmap[i] == -1) {
194                 data.fmap[i] = last;
195                 break;
196             }
197         }
198     }
199 
200 /*---------------------------------------------*/
201 
202 /*---------------------------------------------*/
203 /*--- LBZ2: Fallback O(N log(N)^2) sorting        ---*/
204 /*--- algorithm, for repetitive blocks      ---*/
205 /*---------------------------------------------*/
206 
207     /*
208      * This is the fallback sorting algorithm libbzip2 1.0.6 uses for
209      * repetitive or very short inputs.
210      *
211      * The idea is inspired by Manber-Myers string suffix sorting
212      * algorithm.  First a bucket sort places each permutation of the
213      * block into a bucket based on its first byte.  Permutations are
214      * represented by pointers to their first character kept in
215      * (partially) sorted order inside the array ftab.
216      *
217      * The next step visits all buckets in order and performs a
218      * quicksort on all permutations of the bucket based on the index
219      * of the bucket the second byte of the permutation belongs to,
220      * thereby forming new buckets.  When arrived here the
221      * permutations are sorted up to the second character and we have
222      * buckets of permutations that are identical up to two
223      * characters.
224      *
225      * Repeat the step of quicksorting each bucket, now based on the
226      * bucket holding the sequence of the third and forth character
227      * leading to four byte buckets.  Repeat this doubling of bucket
228      * sizes until all buckets only contain single permutations or the
229      * bucket size exceeds the block size.
230      *
231      * I.e.
232      *
233      * "abraba" form three buckets for the chars "a", "b", and "r" in
234      * the first step with
235      *
236      * fmap = { 'a:' 5, 3, 0, 'b:' 4, 1, 'r', 2 }
237      *
238      * when looking at the bucket of "a"s the second characters are in
239      * the buckets that start with fmap-index 0 (rolled over), 3 and 3
240      * respectively, forming two new buckets "aa" and "ab", so we get
241      *
242      * fmap = { 'aa:' 5, 'ab:' 3, 0, 'ba:' 4, 'br': 1, 'ra:' 2 }
243      *
244      * since the last bucket only contained a single item it didn't
245      * have to be sorted at all.
246      *
247      * There now is just one bucket with more than one permutation
248      * that remains to be sorted.  For the permutation that starts
249      * with index 3 the third and forth char are in bucket 'aa' at
250      * index 0 and for the one starting at block index 0 they are in
251      * bucket 'ra' with sort index 5.  The fully sorted order then becomes.
252      *
253      * fmap = { 5, 3, 0, 4, 1, 2 }
254      * 
255      */
256 
257     /**
258      * @param fmap points to the index of the starting point of a
259      *        permutation inside the block of data in the current
260      *        partially sorted order
261      * @param eclass points from the index of a character inside the
262      *        block to the first index in fmap that contains the
263      *        bucket of its suffix that is sorted in this step.
264      * @param lo lower boundary of the fmap-interval to be sorted 
265      * @param hi upper boundary of the fmap-interval to be sorted 
266      */
267     private void fallbackSimpleSort(final int[] fmap, 
268                                     final int[] eclass, 
269                                     final int lo, 
270                                     final int hi) {
271         if (lo == hi) {
272             return;
273         }
274 
275         int j;
276         if (hi - lo > 3) {
277             for (int i = hi - 4; i >= lo; i--) {
278                 final int tmp = fmap[i];
279                 final int ec_tmp = eclass[tmp];
280                 for (j = i + 4; j <= hi && ec_tmp > eclass[fmap[j]];
281                      j += 4) {
282                     fmap[j - 4] = fmap[j];
283                 }
284                 fmap[j - 4] = tmp;
285             }
286         }
287 
288         for (int i = hi - 1; i >= lo; i--) {
289             final int tmp = fmap[i];
290             final int ec_tmp = eclass[tmp];
291             for (j = i + 1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) {
292                 fmap[j - 1] = fmap[j];
293             }
294             fmap[j-1] = tmp;
295         }
296     }
297 
298     private static final int FALLBACK_QSORT_SMALL_THRESH = 10;
299 
300     /**
301      * swaps two values in fmap
302      */
303     private void fswap(final int[] fmap, final int zz1, final int zz2) {
304         final int zztmp = fmap[zz1];
305         fmap[zz1] = fmap[zz2];
306         fmap[zz2] = zztmp;
307     }
308 
309     /**
310      * swaps two intervals starting at yyp1 and yyp2 of length yyn inside fmap.
311      */
312     private void fvswap(final int[] fmap, int yyp1, int yyp2, int yyn) {
313         while (yyn > 0) {
314             fswap(fmap, yyp1, yyp2);
315             yyp1++; yyp2++; yyn--;
316         }
317     }
318 
319     private int fmin(final int a, final int b) {
320         return a < b ? a : b;
321     }
322 
323     private void fpush(final int sp, final int lz, final int hz) {
324         stack_ll[sp] = lz;
325         stack_hh[sp] = hz;
326     }
327 
328     private int[] fpop(final int sp) {
329         return new int[] { stack_ll[sp], stack_hh[sp] };
330     }
331 
332     /**
333      * @param fmap points to the index of the starting point of a
334      *        permutation inside the block of data in the current
335      *        partially sorted order
336      * @param eclass points from the index of a character inside the
337      *        block to the first index in fmap that contains the
338      *        bucket of its suffix that is sorted in this step.
339      * @param loSt lower boundary of the fmap-interval to be sorted 
340      * @param hiSt upper boundary of the fmap-interval to be sorted 
341      */
342     private void fallbackQSort3(final int[] fmap, 
343                                 final int[] eclass, 
344                                 final int loSt, 
345                                 final int hiSt) {
346         int lo, unLo, ltLo, hi, unHi, gtHi, n;
347 
348         long r = 0;
349         int sp = 0;
350         fpush(sp++, loSt, hiSt);
351 
352         while (sp > 0) {
353             final int[] s = fpop(--sp);
354             lo = s[0]; hi = s[1];
355 
356             if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
357                 fallbackSimpleSort(fmap, eclass, lo, hi);
358                 continue;
359             }
360 
361             /* LBZ2: Random partitioning.  Median of 3 sometimes fails to
362                avoid bad cases.  Median of 9 seems to help but 
363                looks rather expensive.  This too seems to work but
364                is cheaper.  Guidance for the magic constants 
365                7621 and 32768 is taken from Sedgewick's algorithms
366                book, chapter 35.
367             */
368             r = ((r * 7621) + 1) % 32768;
369             final long r3 = r % 3;
370             long med;
371             if (r3 == 0) {
372                 med = eclass[fmap[lo]]; 
373             } else if (r3 == 1) {
374                 med = eclass[fmap[(lo + hi) >>> 1]];
375             } else {
376                 med = eclass[fmap[hi]];
377             }
378 
379             unLo = ltLo = lo;
380             unHi = gtHi = hi;
381 
382             // looks like the ternary partition attributed to Wegner
383             // in the cited Sedgewick paper
384             while (true) {
385                 while (true) {
386                     if (unLo > unHi) {
387                         break;
388                     }
389                     n = eclass[fmap[unLo]] - (int) med;
390                     if (n == 0) { 
391                         fswap(fmap, unLo, ltLo); 
392                         ltLo++; unLo++; 
393                         continue; 
394                     }
395                     if (n > 0) {
396                         break;
397                     }
398                     unLo++;
399                 }
400                 while (true) {
401                     if (unLo > unHi) {
402                         break;
403                     }
404                     n = eclass[fmap[unHi]] - (int) med;
405                     if (n == 0) {
406                         fswap(fmap, unHi, gtHi); 
407                         gtHi--; unHi--; 
408                         continue; 
409                     }
410                     if (n < 0) {
411                         break;
412                     }
413                     unHi--;
414                 }
415                 if (unLo > unHi) {
416                     break;
417                 }
418                 fswap(fmap, unLo, unHi); unLo++; unHi--;
419             }
420 
421             if (gtHi < ltLo) {
422                 continue;
423             }
424 
425             n = fmin(ltLo - lo, unLo - ltLo);
426             fvswap(fmap, lo, unLo - n, n);
427             int m = fmin(hi - gtHi, gtHi - unHi);
428             fvswap(fmap, unHi + 1, hi - m + 1, m);
429 
430             n = lo + unLo - ltLo - 1;
431             m = hi - (gtHi - unHi) + 1;
432 
433             if (n - lo > hi - m) {
434                 fpush(sp++, lo, n);
435                 fpush(sp++, m, hi);
436             } else {
437                 fpush(sp++, m, hi);
438                 fpush(sp++, lo, n);
439             }
440         }
441     }
442 
443 
444 /*---------------------------------------------*/
445 
446     private int[] eclass;
447 
448     private int[] getEclass() {
449         return eclass == null
450             ? (eclass = new int[quadrant.length / 2]) : eclass;
451     }
452 
453     /*
454      * The C code uses an array of ints (each int holding 32 flags) to
455      * represents the bucket-start flags (bhtab).  It also contains
456      * optimizations to skip over 32 consecutively set or
457      * consecutively unset bits on word boundaries at once.  For now
458      * I've chosen to use the simpler but potentially slower code
459      * using BitSet - also in the hope that using the BitSet#nextXXX
460      * methods may be fast enough.
461      */
462 
463     /**
464      * @param fmap points to the index of the starting point of a
465      *        permutation inside the block of data in the current
466      *        partially sorted order
467      * @param block the original data
468      * @param nblock size of the block
469      * @param off offset of first byte to sort in block
470      */
471     final void fallbackSort(final int[] fmap, final byte[] block, final int nblock) {
472         final int[] ftab = new int[257];
473         int H, i, j, k, l, r, cc, cc1;
474         int nNotDone;
475         int nBhtab;
476         final int[] eclass = getEclass();
477 
478         for (i = 0; i < nblock; i++) {
479             eclass[i] = 0;
480         }
481         /*--
482           LBZ2: Initial 1-char radix sort to generate
483           initial fmap and initial BH bits.
484           --*/
485         for (i = 0; i < nblock; i++) {
486             ftab[block[i] & 0xff]++;
487         }
488         for (i = 1; i < 257;    i++) {
489             ftab[i] += ftab[i - 1];
490         }
491 
492         for (i = 0; i < nblock; i++) {
493             j = block[i] & 0xff;
494             k = ftab[j] - 1;
495             ftab[j] = k;
496             fmap[k] = i;
497         }
498 
499         nBhtab = 64 + nblock;
500         final BitSet bhtab = new BitSet(nBhtab);
501         for (i = 0; i < 256; i++) {
502             bhtab.set(ftab[i]);
503         }
504 
505         /*--
506           LBZ2: Inductively refine the buckets.  Kind-of an
507           "exponential radix sort" (!), inspired by the
508           Manber-Myers suffix array construction algorithm.
509           --*/
510 
511         /*-- LBZ2: set sentinel bits for block-end detection --*/
512         for (i = 0; i < 32; i++) { 
513             bhtab.set(nblock + 2 * i);
514             bhtab.clear(nblock + 2 * i + 1);
515         }
516 
517         /*-- LBZ2: the log(N) loop --*/
518         H = 1;
519         while (true) {
520 
521             j = 0;
522             for (i = 0; i < nblock; i++) {
523                 if (bhtab.get(i)) {
524                     j = i;
525                 }
526                 k = fmap[i] - H;
527                 if (k < 0) {
528                     k += nblock;
529                 }
530                 eclass[k] = j;
531             }
532 
533             nNotDone = 0;
534             r = -1;
535             while (true) {
536 
537                 /*-- LBZ2: find the next non-singleton bucket --*/
538                 k = r + 1;
539                 k = bhtab.nextClearBit(k);
540                 l = k - 1;
541                 if (l >= nblock) {
542                     break;
543                 }
544                 k = bhtab.nextSetBit(k + 1);
545                 r = k - 1;
546                 if (r >= nblock) {
547                     break;
548                 }
549 
550                 /*-- LBZ2: now [l, r] bracket current bucket --*/
551                 if (r > l) {
552                     nNotDone += (r - l + 1);
553                     fallbackQSort3(fmap, eclass, l, r);
554 
555                     /*-- LBZ2: scan bucket and generate header bits-- */
556                     cc = -1;
557                     for (i = l; i <= r; i++) {
558                         cc1 = eclass[fmap[i]];
559                         if (cc != cc1) {
560                             bhtab.set(i);
561                             cc = cc1;
562                         }
563                     }
564                 }
565             }
566 
567             H *= 2;
568             if (H > nblock || nNotDone == 0) {
569                 break;
570             }
571         }
572     }
573 
574 /*---------------------------------------------*/
575 
576     /*
577      * LBZ2: Knuth's increments seem to work better than Incerpi-Sedgewick here.
578      * Possibly because the number of elems to sort is usually small, typically
579      * &lt;= 20.
580      */
581     private static final int[] INCS = { 1, 4, 13, 40, 121, 364, 1093, 3280,
582                                         9841, 29524, 88573, 265720, 797161,
583                                         2391484 };
584 
585     /**
586      * This is the most hammered method of this class.
587      *
588      * <p>
589      * This is the version using unrolled loops. Normally I never use such ones
590      * in Java code. The unrolling has shown a noticable performance improvement
591      * on JRE 1.4.2 (Linux i586 / HotSpot Client). Of course it depends on the
592      * JIT compiler of the vm.
593      * </p>
594      */
595     private boolean mainSimpleSort(final BZip2CompressorOutputStream.Data dataShadow,
596                                    final int lo, final int hi, final int d,
597                                    final int lastShadow) {
598         final int bigN = hi - lo + 1;
599         if (bigN < 2) {
600             return this.firstAttempt && (this.workDone > this.workLimit);
601         }
602 
603         int hp = 0;
604         while (INCS[hp] < bigN) {
605             hp++;
606         }
607 
608         final int[] fmap = dataShadow.fmap;
609         final char[] quadrant = this.quadrant;
610         final byte[] block = dataShadow.block;
611         final int lastPlus1 = lastShadow + 1;
612         final boolean firstAttemptShadow = this.firstAttempt;
613         final int workLimitShadow = this.workLimit;
614         int workDoneShadow = this.workDone;
615 
616         // Following block contains unrolled code which could be shortened by
617         // coding it in additional loops.
618 
619         HP: while (--hp >= 0) {
620             final int h = INCS[hp];
621             final int mj = lo + h - 1;
622 
623             for (int i = lo + h; i <= hi;) {
624                 // copy
625                 for (int k = 3; (i <= hi) && (--k >= 0); i++) {
626                     final int v = fmap[i];
627                     final int vd = v + d;
628                     int j = i;
629 
630                     // for (int a;
631                     // (j > mj) && mainGtU((a = fmap[j - h]) + d, vd,
632                     // block, quadrant, lastShadow);
633                     // j -= h) {
634                     // fmap[j] = a;
635                     // }
636                     //
637                     // unrolled version:
638 
639                     // start inline mainGTU
640                     boolean onceRunned = false;
641                     int a = 0;
642 
643                     HAMMER: while (true) {
644                         if (onceRunned) {
645                             fmap[j] = a;
646                             if ((j -= h) <= mj) {
647                                 break HAMMER;
648                             }
649                         } else {
650                             onceRunned = true;
651                         }
652 
653                         a = fmap[j - h];
654                         int i1 = a + d;
655                         int i2 = vd;
656 
657                         // following could be done in a loop, but
658                         // unrolled it for performance:
659                         if (block[i1 + 1] == block[i2 + 1]) {
660                             if (block[i1 + 2] == block[i2 + 2]) {
661                                 if (block[i1 + 3] == block[i2 + 3]) {
662                                     if (block[i1 + 4] == block[i2 + 4]) {
663                                         if (block[i1 + 5] == block[i2 + 5]) {
664                                             if (block[(i1 += 6)] == block[(i2 += 6)]) {
665                                                 int x = lastShadow;
666                                                 X: while (x > 0) {
667                                                     x -= 4;
668 
669                                                     if (block[i1 + 1] == block[i2 + 1]) {
670                                                         if (quadrant[i1] == quadrant[i2]) {
671                                                             if (block[i1 + 2] == block[i2 + 2]) {
672                                                                 if (quadrant[i1 + 1] == quadrant[i2 + 1]) {
673                                                                     if (block[i1 + 3] == block[i2 + 3]) {
674                                                                         if (quadrant[i1 + 2] == quadrant[i2 + 2]) {
675                                                                             if (block[i1 + 4] == block[i2 + 4]) {
676                                                                                 if (quadrant[i1 + 3] == quadrant[i2 + 3]) {
677                                                                                     if ((i1 += 4) >= lastPlus1) {
678                                                                                         i1 -= lastPlus1;
679                                                                                     }
680                                                                                     if ((i2 += 4) >= lastPlus1) {
681                                                                                         i2 -= lastPlus1;
682                                                                                     }
683                                                                                     workDoneShadow++;
684                                                                                     continue X;
685                                                                                 } else if ((quadrant[i1 + 3] > quadrant[i2 + 3])) {
686                                                                                     continue HAMMER;
687                                                                                 } else {
688                                                                                     break HAMMER;
689                                                                                 }
690                                                                             } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
691                                                                                 continue HAMMER;
692                                                                             } else {
693                                                                                 break HAMMER;
694                                                                             }
695                                                                         } else if ((quadrant[i1 + 2] > quadrant[i2 + 2])) {
696                                                                             continue HAMMER;
697                                                                         } else {
698                                                                             break HAMMER;
699                                                                         }
700                                                                     } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
701                                                                         continue HAMMER;
702                                                                     } else {
703                                                                         break HAMMER;
704                                                                     }
705                                                                 } else if ((quadrant[i1 + 1] > quadrant[i2 + 1])) {
706                                                                     continue HAMMER;
707                                                                 } else {
708                                                                     break HAMMER;
709                                                                 }
710                                                             } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
711                                                                 continue HAMMER;
712                                                             } else {
713                                                                 break HAMMER;
714                                                             }
715                                                         } else if ((quadrant[i1] > quadrant[i2])) {
716                                                             continue HAMMER;
717                                                         } else {
718                                                             break HAMMER;
719                                                         }
720                                                     } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
721                                                         continue HAMMER;
722                                                     } else {
723                                                         break HAMMER;
724                                                     }
725 
726                                                 }
727                                                 break HAMMER;
728                                             } // while x > 0
729                                             if ((block[i1] & 0xff) > (block[i2] & 0xff)) {
730                                                 continue HAMMER;
731                                             }
732                                             break HAMMER;
733                                         } else if ((block[i1 + 5] & 0xff) > (block[i2 + 5] & 0xff)) {
734                                             continue HAMMER;
735                                         } else {
736                                             break HAMMER;
737                                         }
738                                     } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
739                                         continue HAMMER;
740                                     } else {
741                                         break HAMMER;
742                                     }
743                                 } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
744                                     continue HAMMER;
745                                 } else {
746                                     break HAMMER;
747                                 }
748                             } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
749                                 continue HAMMER;
750                             } else {
751                                 break HAMMER;
752                             }
753                         } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
754                             continue HAMMER;
755                         } else {
756                             break HAMMER;
757                         }
758 
759                     } // HAMMER
760                     // end inline mainGTU
761 
762                     fmap[j] = v;
763                 }
764 
765                 if (firstAttemptShadow && (i <= hi)
766                     && (workDoneShadow > workLimitShadow)) {
767                     break HP;
768                 }
769             }
770         }
771 
772         this.workDone = workDoneShadow;
773         return firstAttemptShadow && (workDoneShadow > workLimitShadow);
774     }
775 
776 /*--
777    LBZ2: The following is an implementation of
778    an elegant 3-way quicksort for strings,
779    described in a paper "Fast Algorithms for
780    Sorting and Searching Strings", by Robert
781    Sedgewick and Jon L. Bentley.
782 --*/
783 
784     private static void vswap(final int[] fmap, int p1, int p2, int n) {
785         n += p1;
786         while (p1 < n) {
787             final int t = fmap[p1];
788             fmap[p1++] = fmap[p2];
789             fmap[p2++] = t;
790         }
791     }
792 
793     private static byte med3(final byte a, final byte b, final byte c) {
794         return (a < b) ? (b < c ? b : a < c ? c : a) : (b > c ? b : a > c ? c
795                                                         : a);
796     }
797 
798     private static final int SMALL_THRESH = 20;
799     private static final int DEPTH_THRESH = 10;
800     private static final int WORK_FACTOR = 30;
801 
802     /**
803      * Method "mainQSort3", file "blocksort.c", BZip2 1.0.2
804      */
805     private void mainQSort3(final BZip2CompressorOutputStream.Data dataShadow,
806                             final int loSt, final int hiSt, final int dSt,
807                             final int last) {
808         final int[] stack_ll = this.stack_ll;
809         final int[] stack_hh = this.stack_hh;
810         final int[] stack_dd = this.stack_dd;
811         final int[] fmap = dataShadow.fmap;
812         final byte[] block = dataShadow.block;
813 
814         stack_ll[0] = loSt;
815         stack_hh[0] = hiSt;
816         stack_dd[0] = dSt;
817 
818         for (int sp = 1; --sp >= 0;) {
819             final int lo = stack_ll[sp];
820             final int hi = stack_hh[sp];
821             final int d = stack_dd[sp];
822 
823             if ((hi - lo < SMALL_THRESH) || (d > DEPTH_THRESH)) {
824                 if (mainSimpleSort(dataShadow, lo, hi, d, last)) {
825                     return;
826                 }
827             } else {
828                 final int d1 = d + 1;
829                 final int med = med3(block[fmap[lo] + d1],
830                                      block[fmap[hi] + d1], block[fmap[(lo + hi) >>> 1] + d1]) & 0xff;
831 
832                 int unLo = lo;
833                 int unHi = hi;
834                 int ltLo = lo;
835                 int gtHi = hi;
836 
837                 while (true) {
838                     while (unLo <= unHi) {
839                         final int n = (block[fmap[unLo] + d1] & 0xff)
840                             - med;
841                         if (n == 0) {
842                             final int temp = fmap[unLo];
843                             fmap[unLo++] = fmap[ltLo];
844                             fmap[ltLo++] = temp;
845                         } else if (n < 0) {
846                             unLo++;
847                         } else {
848                             break;
849                         }
850                     }
851 
852                     while (unLo <= unHi) {
853                         final int n = (block[fmap[unHi] + d1] & 0xff)
854                             - med;
855                         if (n == 0) {
856                             final int temp = fmap[unHi];
857                             fmap[unHi--] = fmap[gtHi];
858                             fmap[gtHi--] = temp;
859                         } else if (n > 0) {
860                             unHi--;
861                         } else {
862                             break;
863                         }
864                     }
865 
866                     if (unLo <= unHi) {
867                         final int temp = fmap[unLo];
868                         fmap[unLo++] = fmap[unHi];
869                         fmap[unHi--] = temp;
870                     } else {
871                         break;
872                     }
873                 }
874 
875                 if (gtHi < ltLo) {
876                     stack_ll[sp] = lo;
877                     stack_hh[sp] = hi;
878                     stack_dd[sp] = d1;
879                     sp++;
880                 } else {
881                     int n = ((ltLo - lo) < (unLo - ltLo)) ? (ltLo - lo)
882                         : (unLo - ltLo);
883                     vswap(fmap, lo, unLo - n, n);
884                     int m = ((hi - gtHi) < (gtHi - unHi)) ? (hi - gtHi)
885                         : (gtHi - unHi);
886                     vswap(fmap, unLo, hi - m + 1, m);
887 
888                     n = lo + unLo - ltLo - 1;
889                     m = hi - (gtHi - unHi) + 1;
890 
891                     stack_ll[sp] = lo;
892                     stack_hh[sp] = n;
893                     stack_dd[sp] = d;
894                     sp++;
895 
896                     stack_ll[sp] = n + 1;
897                     stack_hh[sp] = m - 1;
898                     stack_dd[sp] = d1;
899                     sp++;
900 
901                     stack_ll[sp] = m;
902                     stack_hh[sp] = hi;
903                     stack_dd[sp] = d;
904                     sp++;
905                 }
906             }
907         }
908     }
909 
910     private static final int SETMASK = (1 << 21);
911     private static final int CLEARMASK = (~SETMASK);
912 
913     final void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
914                         final int lastShadow) {
915         final int[] runningOrder = this.mainSort_runningOrder;
916         final int[] copy = this.mainSort_copy;
917         final boolean[] bigDone = this.mainSort_bigDone;
918         final int[] ftab = this.ftab;
919         final byte[] block = dataShadow.block;
920         final int[] fmap = dataShadow.fmap;
921         final char[] quadrant = this.quadrant;
922         final int workLimitShadow = this.workLimit;
923         final boolean firstAttemptShadow = this.firstAttempt;
924 
925         // LBZ2: Set up the 2-byte frequency table
926         for (int i = 65537; --i >= 0;) {
927             ftab[i] = 0;
928         }
929 
930         /*
931          * In the various block-sized structures, live data runs from 0 to
932          * last+NUM_OVERSHOOT_BYTES inclusive. First, set up the overshoot area
933          * for block.
934          */
935         for (int i = 0; i < BZip2Constants.NUM_OVERSHOOT_BYTES; i++) {
936             block[lastShadow + i + 2] = block[(i % (lastShadow + 1)) + 1];
937         }
938         for (int i = lastShadow + BZip2Constants.NUM_OVERSHOOT_BYTES +1; --i >= 0;) {
939             quadrant[i] = 0;
940         }
941         block[0] = block[lastShadow + 1];
942 
943         // LBZ2: Complete the initial radix sort:
944 
945         int c1 = block[0] & 0xff;
946         for (int i = 0; i <= lastShadow; i++) {
947             final int c2 = block[i + 1] & 0xff;
948             ftab[(c1 << 8) + c2]++;
949             c1 = c2;
950         }
951 
952         for (int i = 1; i <= 65536; i++) {
953             ftab[i] += ftab[i - 1];
954         }
955 
956         c1 = block[1] & 0xff;
957         for (int i = 0; i < lastShadow; i++) {
958             final int c2 = block[i + 2] & 0xff;
959             fmap[--ftab[(c1 << 8) + c2]] = i;
960             c1 = c2;
961         }
962 
963         fmap[--ftab[((block[lastShadow + 1] & 0xff) << 8) + (block[1] & 0xff)]] = lastShadow;
964 
965         /*
966          * LBZ2: Now ftab contains the first loc of every small bucket. Calculate the
967          * running order, from smallest to largest big bucket.
968          */
969         for (int i = 256; --i >= 0;) {
970             bigDone[i] = false;
971             runningOrder[i] = i;
972         }
973 
974         for (int h = 364; h != 1;) {
975             h /= 3;
976             for (int i = h; i <= 255; i++) {
977                 final int vv = runningOrder[i];
978                 final int a = ftab[(vv + 1) << 8] - ftab[vv << 8];
979                 final int b = h - 1;
980                 int j = i;
981                 for (int ro = runningOrder[j - h]; (ftab[(ro + 1) << 8] - ftab[ro << 8]) > a; ro = runningOrder[j
982                                                                                                                 - h]) {
983                     runningOrder[j] = ro;
984                     j -= h;
985                     if (j <= b) {
986                         break;
987                     }
988                 }
989                 runningOrder[j] = vv;
990             }
991         }
992 
993         /*
994          * LBZ2: The main sorting loop.
995          */
996         for (int i = 0; i <= 255; i++) {
997             /*
998              * LBZ2: Process big buckets, starting with the least full.
999              */
1000             final int ss = runningOrder[i];
1001 
1002             // Step 1:
1003             /*
1004              * LBZ2: Complete the big bucket [ss] by quicksorting any unsorted small
1005              * buckets [ss, j]. Hopefully previous pointer-scanning phases have
1006              * already completed many of the small buckets [ss, j], so we don't
1007              * have to sort them at all.
1008              */
1009             for (int j = 0; j <= 255; j++) {
1010                 final int sb = (ss << 8) + j;
1011                 final int ftab_sb = ftab[sb];
1012                 if ((ftab_sb & SETMASK) != SETMASK) {
1013                     final int lo = ftab_sb & CLEARMASK;
1014                     final int hi = (ftab[sb + 1] & CLEARMASK) - 1;
1015                     if (hi > lo) {
1016                         mainQSort3(dataShadow, lo, hi, 2, lastShadow);
1017                         if (firstAttemptShadow
1018                             && (this.workDone > workLimitShadow)) {
1019                             return;
1020                         }
1021                     }
1022                     ftab[sb] = ftab_sb | SETMASK;
1023                 }
1024             }
1025 
1026             // Step 2:
1027             // LBZ2: Now scan this big bucket so as to synthesise the
1028             // sorted order for small buckets [t, ss] for all t != ss.
1029 
1030             for (int j = 0; j <= 255; j++) {
1031                 copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
1032             }
1033 
1034             for (int j = ftab[ss << 8] & CLEARMASK, hj = (ftab[(ss + 1) << 8] & CLEARMASK); j < hj; j++) {
1035                 final int fmap_j = fmap[j];
1036                 c1 = block[fmap_j] & 0xff;
1037                 if (!bigDone[c1]) {
1038                     fmap[copy[c1]] = (fmap_j == 0) ? lastShadow : (fmap_j - 1);
1039                     copy[c1]++;
1040                 }
1041             }
1042 
1043             for (int j = 256; --j >= 0;) {
1044                 ftab[(j << 8) + ss] |= SETMASK;
1045             }
1046 
1047             // Step 3:
1048             /*
1049              * LBZ2: The ss big bucket is now done. Record this fact, and update the
1050              * quadrant descriptors. Remember to update quadrants in the
1051              * overshoot area too, if necessary. The "if (i < 255)" test merely
1052              * skips this updating for the last bucket processed, since updating
1053              * for the last bucket is pointless.
1054              */
1055             bigDone[ss] = true;
1056 
1057             if (i < 255) {
1058                 final int bbStart = ftab[ss << 8] & CLEARMASK;
1059                 final int bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart;
1060                 int shifts = 0;
1061 
1062                 while ((bbSize >> shifts) > 65534) {
1063                     shifts++;
1064                 }
1065 
1066                 for (int j = 0; j < bbSize; j++) {
1067                     final int a2update = fmap[bbStart + j];
1068                     final char qVal = (char) (j >> shifts);
1069                     quadrant[a2update] = qVal;
1070                     if (a2update < BZip2Constants.NUM_OVERSHOOT_BYTES) {
1071                         quadrant[a2update + lastShadow + 1] = qVal;
1072                     }
1073                 }
1074             }
1075 
1076         }
1077     }
1078 
1079 }