View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.bzip2;
20  
21  import java.util.BitSet;
22  
23  /**
24   * Encapsulates the Burrows-Wheeler sorting algorithm needed by {@link
25   * BZip2CompressorOutputStream}.
26   *
27   * <p>This class is based on a Java port of Julian Seward's
28   * blocksort.c in his libbzip2</p>
29   *
30   * <p>The Burrows-Wheeler transform is a reversible transform of the
31   * original data that is supposed to group similiar bytes close to
32   * each other.  The idea is to sort all permutations of the input and
33   * only keep the last byte of each permutation.  E.g. for "Commons
34   * Compress" you'd get:</p>
35   *
36   * <pre>
37   *  CompressCommons
38   * Commons Compress
39   * CompressCommons 
40   * essCommons Compr
41   * mmons CompressCo
42   * mons CompressCom
43   * mpressCommons Co
44   * ns CompressCommo
45   * ommons CompressC
46   * ompressCommons C
47   * ons CompressComm
48   * pressCommons Com
49   * ressCommons Comp
50   * s CompressCommon
51   * sCommons Compres
52   * ssCommons Compre
53   * </pre>
54   *
55   * <p>Which results in a new text "ss romooCCmmpnse", in adition the
56   * index of the first line that contained the original text is kept -
57   * in this case it is 1.  The idea is that in a long English text all
58   * permutations that start with "he" are likely suffixes of a "the" and
59   * thus they end in "t" leading to a larger block of "t"s that can
60   * better be compressed by the subsequent Move-to-Front, run-length
61   * und Huffman encoding steps.</p>
62   *
63   * <p>For more information see for example:</p>
64   * <ul>
65   *   <li><a
66   *   href="http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf">Burrows,
67   *   M. and Wheeler, D.: A Block-sorting Lossless Data Compression
68   *   Algorithm</a></li>
69   *   <li><a href="http://webglimpse.net/pubs/suffix.pdf">Manber, U. and
70   *   Myers, G.: Suffix arrays: A new method for on-line string
71   *   searches</a></li>
72   *   <li><a
73   *   href="http://www.cs.tufts.edu/~nr/comp150fp/archive/bob-sedgewick/fast-strings.pdf">Bentley,
74   *   J.L. and Sedgewick, R.: Fast Algorithms for Sorting and Searching
75   *   Strings</a></li>
76   * </ul>
77   *
78   * @NotThreadSafe
79   */
80  class BlockSort {
81  
82      /*
83       * Some of the constructs used in the C code cannot be ported
84       * literally to Java - for example macros, unsigned types.  Some
85       * code has been hand-tuned to improve performance.  In order to
86       * avoid memory pressure some structures are reused for several
87       * blocks and some memory is even shared between sorting and the
88       * MTF stage even though either algorithm uses it for its own
89       * purpose.
90       *
91       * Comments preserved from the actual C code are prefixed with
92       * "LBZ2:".
93       */
94  
95      /*
96       * 2012-05-20 Stefan Bodewig:
97       *
98       * This class seems to mix several revisions of libbzip2's code.
99       * The mainSort function and those used by it look closer to the
100      * 0.9.5 version but show some variations introduced later.  At
101      * the same time the logic of Compress 1.4 to randomize the block
102      * on bad input has been dropped after libbzip2 0.9.0 and replaced
103      * by a fallback sorting algorithm.
104      *
105      * I've added the fallbackSort function of 1.0.6 and tried to
106      * integrate it with the existing code without touching too much.
107      * I've also removed the now unused randomization code.
108      */
109 
110     /*
111      * LBZ2: If you are ever unlucky/improbable enough to get a stack
112      * overflow whilst sorting, increase the following constant and
113      * try again. In practice I have never seen the stack go above 27
114      * elems, so the following limit seems very generous.
115      */
116     private static final int QSORT_STACK_SIZE = 1000;
117 
118     private static final int FALLBACK_QSORT_STACK_SIZE = 100;
119 
120     private static final int STACK_SIZE =
121         QSORT_STACK_SIZE < FALLBACK_QSORT_STACK_SIZE
122         ? FALLBACK_QSORT_STACK_SIZE : QSORT_STACK_SIZE;
123 
124     /*
125      * Used when sorting. If too many long comparisons happen, we stop sorting,
126      * and use fallbackSort instead.
127      */
128     private int workDone;
129     private int workLimit;
130     private boolean firstAttempt;
131 
132     private final int[] stack_ll = new int[STACK_SIZE]; // 4000 byte
133     private final int[] stack_hh = new int[STACK_SIZE]; // 4000 byte
134     private final int[] stack_dd = new int[QSORT_STACK_SIZE]; // 4000 byte
135 
136     private final int[] mainSort_runningOrder = new int[256]; // 1024 byte
137     private final int[] mainSort_copy = new int[256]; // 1024 byte
138     private final boolean[] mainSort_bigDone = new boolean[256]; // 256 byte
139 
140     private final int[] ftab = new int[65537]; // 262148 byte
141 
142     /**
143      * Array instance identical to Data's sfmap, both are used only
144      * temporarily and indepently, so we do not need to allocate
145      * additional memory.
146      */
147     private final char[] quadrant;
148 
149     BlockSort(final BZip2CompressorOutputStream.Data data) {
150         this.quadrant = data.sfmap;
151     }
152 
153     void blockSort(final BZip2CompressorOutputStream.Data data, final int last) {
154         this.workLimit = WORK_FACTOR * last;
155         this.workDone = 0;
156         this.firstAttempt = true;
157 
158         if (last + 1 < 10000) {
159             fallbackSort(data, last);
160         } else {
161             mainSort(data, last);
162 
163             if (this.firstAttempt && (this.workDone > this.workLimit)) {
164                 fallbackSort(data, last);
165             }
166         }
167 
168         final int[] fmap = data.fmap;
169         data.origPtr = -1;
170         for (int i = 0; i <= last; i++) {
171             if (fmap[i] == 0) {
172                 data.origPtr = i;
173                 break;
174             }
175         }
176 
177         // assert (data.origPtr != -1) : data.origPtr;
178     }
179 
180     /**
181      * Adapt fallbackSort to the expected interface of the rest of the
182      * code, in particular deal with the fact that block starts at
183      * offset 1 (in libbzip2 1.0.6 it starts at 0).
184      */
185     final void fallbackSort(final BZip2CompressorOutputStream.Data data,
186                             final int last) {
187         data.block[0] = data.block[last + 1];
188         fallbackSort(data.fmap, data.block, last + 1);
189         for (int i = 0; i < last + 1; i++) {
190             --data.fmap[i];
191         }
192         for (int i = 0; i < last + 1; i++) {
193             if (data.fmap[i] == -1) {
194                 data.fmap[i] = last;
195                 break;
196             }
197         }
198     }
199 
200 /*---------------------------------------------*/
201 
202 /*---------------------------------------------*/
203 /*--- LBZ2: Fallback O(N log(N)^2) sorting        ---*/
204 /*--- algorithm, for repetitive blocks      ---*/
205 /*---------------------------------------------*/
206 
207     /*
208      * This is the fallback sorting algorithm libbzip2 1.0.6 uses for
209      * repetitive or very short inputs.
210      *
211      * The idea is inspired by Manber-Myers string suffix sorting
212      * algorithm.  First a bucket sort places each permutation of the
213      * block into a bucket based on its first byte.  Permutations are
214      * represented by pointers to their first character kept in
215      * (partially) sorted order inside the array ftab.
216      *
217      * The next step visits all buckets in order and performs a
218      * quicksort on all permutations of the bucket based on the index
219      * of the bucket the second byte of the permutation belongs to,
220      * thereby forming new buckets.  When arrived here the
221      * permutations are sorted up to the second character and we have
222      * buckets of permutations that are identical up to two
223      * characters.
224      *
225      * Repeat the step of quicksorting each bucket, now based on the
226      * bucket holding the sequence of the third and forth character
227      * leading to four byte buckets.  Repeat this doubling of bucket
228      * sizes until all buckets only contain single permutations or the
229      * bucket size exceeds the block size.
230      *
231      * I.e.
232      *
233      * "abraba" form three buckets for the chars "a", "b", and "r" in
234      * the first step with
235      *
236      * fmap = { 'a:' 5, 3, 0, 'b:' 4, 1, 'r', 2 }
237      *
238      * when looking at the bucket of "a"s the second characters are in
239      * the buckets that start with fmap-index 0 (rolled over), 3 and 3
240      * respectively, forming two new buckets "aa" and "ab", so we get
241      *
242      * fmap = { 'aa:' 5, 'ab:' 3, 0, 'ba:' 4, 'br': 1, 'ra:' 2 }
243      *
244      * since the last bucket only contained a single item it didn't
245      * have to be sorted at all.
246      *
247      * There now is just one bucket with more than one permutation
248      * that remains to be sorted.  For the permutation that starts
249      * with index 3 the third and forth char are in bucket 'aa' at
250      * index 0 and for the one starting at block index 0 they are in
251      * bucket 'ra' with sort index 5.  The fully sorted order then becomes.
252      *
253      * fmap = { 5, 3, 0, 4, 1, 2 }
254      * 
255      */
256 
257     /**
258      * @param fmap points to the index of the starting point of a
259      *        permutation inside the block of data in the current
260      *        partially sorted order
261      * @param eclass points from the index of a character inside the
262      *        block to the first index in fmap that contains the
263      *        bucket of its suffix that is sorted in this step.
264      * @param lo lower boundary of the fmap-interval to be sorted 
265      * @param hi upper boundary of the fmap-interval to be sorted 
266      */
267     private void fallbackSimpleSort(int[] fmap, 
268                                     int[] eclass, 
269                                     int lo, 
270                                     int hi) {
271         if (lo == hi) {
272             return;
273         }
274 
275         int j;
276         if (hi - lo > 3) {
277             for (int i = hi - 4; i >= lo; i--) {
278                 int tmp = fmap[i];
279                 int ec_tmp = eclass[tmp];
280                 for (j = i + 4; j <= hi && ec_tmp > eclass[fmap[j]];
281                      j += 4) {
282                     fmap[j - 4] = fmap[j];
283                 }
284                 fmap[j - 4] = tmp;
285             }
286         }
287 
288         for (int i = hi - 1; i >= lo; i--) {
289             int tmp = fmap[i];
290             int ec_tmp = eclass[tmp];
291             for (j = i + 1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) {
292                 fmap[j - 1] = fmap[j];
293             }
294             fmap[j-1] = tmp;
295         }
296     }
297 
298     private static final int FALLBACK_QSORT_SMALL_THRESH = 10;
299 
300     /**
301      * swaps two values in fmap
302      */
303     private void fswap(int[] fmap, int zz1, int zz2) {
304         int zztmp = fmap[zz1];
305         fmap[zz1] = fmap[zz2];
306         fmap[zz2] = zztmp;
307     }
308 
309     /**
310      * swaps two intervals starting at yyp1 and yyp2 of length yyn inside fmap.
311      */
312     private void fvswap(int[] fmap, int yyp1, int yyp2, int yyn) {
313         while (yyn > 0) {
314             fswap(fmap, yyp1, yyp2);
315             yyp1++; yyp2++; yyn--;
316         }
317     }
318 
319     private int fmin(int a, int b) {
320         return a < b ? a : b;
321     }
322 
323     private void fpush(int sp, int lz, int hz) {
324         stack_ll[sp] = lz;
325         stack_hh[sp] = hz;
326     }
327 
328     private int[] fpop(int sp) {
329         return new int[] { stack_ll[sp], stack_hh[sp] };
330     }
331 
332     /**
333      * @param fmap points to the index of the starting point of a
334      *        permutation inside the block of data in the current
335      *        partially sorted order
336      * @param eclass points from the index of a character inside the
337      *        block to the first index in fmap that contains the
338      *        bucket of its suffix that is sorted in this step.
339      * @param loSt lower boundary of the fmap-interval to be sorted 
340      * @param hiSt upper boundary of the fmap-interval to be sorted 
341      */
342     private void fallbackQSort3(int[] fmap, 
343                                 int[] eclass, 
344                                 int loSt, 
345                                 int hiSt) {
346         int lo, unLo, ltLo, hi, unHi, gtHi, n;
347 
348         long r = 0;
349         int sp = 0;
350         fpush(sp++, loSt, hiSt);
351 
352         while (sp > 0) {
353             int[] s = fpop(--sp);
354             lo = s[0]; hi = s[1];
355 
356             if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
357                 fallbackSimpleSort(fmap, eclass, lo, hi);
358                 continue;
359             }
360 
361             /* LBZ2: Random partitioning.  Median of 3 sometimes fails to
362                avoid bad cases.  Median of 9 seems to help but 
363                looks rather expensive.  This too seems to work but
364                is cheaper.  Guidance for the magic constants 
365                7621 and 32768 is taken from Sedgewick's algorithms
366                book, chapter 35.
367             */
368             r = ((r * 7621) + 1) % 32768;
369             long r3 = r % 3, med;
370             if (r3 == 0) {
371                 med = eclass[fmap[lo]]; 
372             } else if (r3 == 1) {
373                 med = eclass[fmap[(lo + hi) >>> 1]];
374             } else {
375                 med = eclass[fmap[hi]];
376             }
377 
378             unLo = ltLo = lo;
379             unHi = gtHi = hi;
380 
381             // looks like the ternary partition attributed to Wegner
382             // in the cited Sedgewick paper
383             while (true) {
384                 while (true) {
385                     if (unLo > unHi) {
386                         break;
387                     }
388                     n = eclass[fmap[unLo]] - (int) med;
389                     if (n == 0) { 
390                         fswap(fmap, unLo, ltLo); 
391                         ltLo++; unLo++; 
392                         continue; 
393                     }
394                     if (n > 0) {
395                         break;
396                     }
397                     unLo++;
398                 }
399                 while (true) {
400                     if (unLo > unHi) {
401                         break;
402                     }
403                     n = eclass[fmap[unHi]] - (int) med;
404                     if (n == 0) {
405                         fswap(fmap, unHi, gtHi); 
406                         gtHi--; unHi--; 
407                         continue; 
408                     }
409                     if (n < 0) {
410                         break;
411                     }
412                     unHi--;
413                 }
414                 if (unLo > unHi) {
415                     break;
416                 }
417                 fswap(fmap, unLo, unHi); unLo++; unHi--;
418             }
419 
420             if (gtHi < ltLo) {
421                 continue;
422             }
423 
424             n = fmin(ltLo - lo, unLo - ltLo);
425             fvswap(fmap, lo, unLo - n, n);
426             int m = fmin(hi - gtHi, gtHi - unHi);
427             fvswap(fmap, unHi + 1, hi - m + 1, m);
428 
429             n = lo + unLo - ltLo - 1;
430             m = hi - (gtHi - unHi) + 1;
431 
432             if (n - lo > hi - m) {
433                 fpush(sp++, lo, n);
434                 fpush(sp++, m, hi);
435             } else {
436                 fpush(sp++, m, hi);
437                 fpush(sp++, lo, n);
438             }
439         }
440     }
441 
442 
443 /*---------------------------------------------*/
444 
445     private int[] eclass;
446 
447     private int[] getEclass() {
448         return eclass == null
449             ? (eclass = new int[quadrant.length / 2]) : eclass;
450     }
451 
452     /*
453      * The C code uses an array of ints (each int holding 32 flags) to
454      * represents the bucket-start flags (bhtab).  It also contains
455      * optimizations to skip over 32 consecutively set or
456      * consecutively unset bits on word boundaries at once.  For now
457      * I've chosen to use the simpler but potentially slower code
458      * using BitSet - also in the hope that using the BitSet#nextXXX
459      * methods may be fast enough.
460      */
461 
462     /**
463      * @param fmap points to the index of the starting point of a
464      *        permutation inside the block of data in the current
465      *        partially sorted order
466      * @param block the original data
467      * @param nblock size of the block
468      * @param off offset of first byte to sort in block
469      */
470     final void fallbackSort(int[] fmap, byte[] block, int nblock) {
471         final int[] ftab = new int[257];
472         int H, i, j, k, l, r, cc, cc1;
473         int nNotDone;
474         int nBhtab;
475         final int[] eclass = getEclass();
476 
477         for (i = 0; i < nblock; i++) {
478             eclass[i] = 0;
479         }
480         /*--
481           LBZ2: Initial 1-char radix sort to generate
482           initial fmap and initial BH bits.
483           --*/
484         for (i = 0; i < nblock; i++) {
485             ftab[block[i] & 0xff]++;
486         }
487         for (i = 1; i < 257;    i++) {
488             ftab[i] += ftab[i - 1];
489         }
490 
491         for (i = 0; i < nblock; i++) {
492             j = block[i] & 0xff;
493             k = ftab[j] - 1;
494             ftab[j] = k;
495             fmap[k] = i;
496         }
497 
498         nBhtab = 64 + nblock;
499         BitSet bhtab = new BitSet(nBhtab);
500         for (i = 0; i < 256; i++) {
501             bhtab.set(ftab[i]);
502         }
503 
504         /*--
505           LBZ2: Inductively refine the buckets.  Kind-of an
506           "exponential radix sort" (!), inspired by the
507           Manber-Myers suffix array construction algorithm.
508           --*/
509 
510         /*-- LBZ2: set sentinel bits for block-end detection --*/
511         for (i = 0; i < 32; i++) { 
512             bhtab.set(nblock + 2 * i);
513             bhtab.clear(nblock + 2 * i + 1);
514         }
515 
516         /*-- LBZ2: the log(N) loop --*/
517         H = 1;
518         while (true) {
519 
520             j = 0;
521             for (i = 0; i < nblock; i++) {
522                 if (bhtab.get(i)) {
523                     j = i;
524                 }
525                 k = fmap[i] - H;
526                 if (k < 0) {
527                     k += nblock;
528                 }
529                 eclass[k] = j;
530             }
531 
532             nNotDone = 0;
533             r = -1;
534             while (true) {
535 
536                 /*-- LBZ2: find the next non-singleton bucket --*/
537                 k = r + 1;
538                 k = bhtab.nextClearBit(k);
539                 l = k - 1;
540                 if (l >= nblock) {
541                     break;
542                 }
543                 k = bhtab.nextSetBit(k + 1);
544                 r = k - 1;
545                 if (r >= nblock) {
546                     break;
547                 }
548 
549                 /*-- LBZ2: now [l, r] bracket current bucket --*/
550                 if (r > l) {
551                     nNotDone += (r - l + 1);
552                     fallbackQSort3(fmap, eclass, l, r);
553 
554                     /*-- LBZ2: scan bucket and generate header bits-- */
555                     cc = -1;
556                     for (i = l; i <= r; i++) {
557                         cc1 = eclass[fmap[i]];
558                         if (cc != cc1) {
559                             bhtab.set(i);
560                             cc = cc1;
561                         }
562                     }
563                 }
564             }
565 
566             H *= 2;
567             if (H > nblock || nNotDone == 0) {
568                 break;
569             }
570         }
571     }
572 
573 /*---------------------------------------------*/
574 
575     /*
576      * LBZ2: Knuth's increments seem to work better than Incerpi-Sedgewick here.
577      * Possibly because the number of elems to sort is usually small, typically
578      * &lt;= 20.
579      */
580     private static final int[] INCS = { 1, 4, 13, 40, 121, 364, 1093, 3280,
581                                         9841, 29524, 88573, 265720, 797161,
582                                         2391484 };
583 
584     /**
585      * This is the most hammered method of this class.
586      *
587      * <p>
588      * This is the version using unrolled loops. Normally I never use such ones
589      * in Java code. The unrolling has shown a noticable performance improvement
590      * on JRE 1.4.2 (Linux i586 / HotSpot Client). Of course it depends on the
591      * JIT compiler of the vm.
592      * </p>
593      */
594     private boolean mainSimpleSort(final BZip2CompressorOutputStream.Data dataShadow,
595                                    final int lo, final int hi, final int d,
596                                    final int lastShadow) {
597         final int bigN = hi - lo + 1;
598         if (bigN < 2) {
599             return this.firstAttempt && (this.workDone > this.workLimit);
600         }
601 
602         int hp = 0;
603         while (INCS[hp] < bigN) {
604             hp++;
605         }
606 
607         final int[] fmap = dataShadow.fmap;
608         final char[] quadrant = this.quadrant;
609         final byte[] block = dataShadow.block;
610         final int lastPlus1 = lastShadow + 1;
611         final boolean firstAttemptShadow = this.firstAttempt;
612         final int workLimitShadow = this.workLimit;
613         int workDoneShadow = this.workDone;
614 
615         // Following block contains unrolled code which could be shortened by
616         // coding it in additional loops.
617 
618         HP: while (--hp >= 0) {
619             final int h = INCS[hp];
620             final int mj = lo + h - 1;
621 
622             for (int i = lo + h; i <= hi;) {
623                 // copy
624                 for (int k = 3; (i <= hi) && (--k >= 0); i++) {
625                     final int v = fmap[i];
626                     final int vd = v + d;
627                     int j = i;
628 
629                     // for (int a;
630                     // (j > mj) && mainGtU((a = fmap[j - h]) + d, vd,
631                     // block, quadrant, lastShadow);
632                     // j -= h) {
633                     // fmap[j] = a;
634                     // }
635                     //
636                     // unrolled version:
637 
638                     // start inline mainGTU
639                     boolean onceRunned = false;
640                     int a = 0;
641 
642                     HAMMER: while (true) {
643                         if (onceRunned) {
644                             fmap[j] = a;
645                             if ((j -= h) <= mj) {
646                                 break HAMMER;
647                             }
648                         } else {
649                             onceRunned = true;
650                         }
651 
652                         a = fmap[j - h];
653                         int i1 = a + d;
654                         int i2 = vd;
655 
656                         // following could be done in a loop, but
657                         // unrolled it for performance:
658                         if (block[i1 + 1] == block[i2 + 1]) {
659                             if (block[i1 + 2] == block[i2 + 2]) {
660                                 if (block[i1 + 3] == block[i2 + 3]) {
661                                     if (block[i1 + 4] == block[i2 + 4]) {
662                                         if (block[i1 + 5] == block[i2 + 5]) {
663                                             if (block[(i1 += 6)] == block[(i2 += 6)]) {
664                                                 int x = lastShadow;
665                                                 X: while (x > 0) {
666                                                     x -= 4;
667 
668                                                     if (block[i1 + 1] == block[i2 + 1]) {
669                                                         if (quadrant[i1] == quadrant[i2]) {
670                                                             if (block[i1 + 2] == block[i2 + 2]) {
671                                                                 if (quadrant[i1 + 1] == quadrant[i2 + 1]) {
672                                                                     if (block[i1 + 3] == block[i2 + 3]) {
673                                                                         if (quadrant[i1 + 2] == quadrant[i2 + 2]) {
674                                                                             if (block[i1 + 4] == block[i2 + 4]) {
675                                                                                 if (quadrant[i1 + 3] == quadrant[i2 + 3]) {
676                                                                                     if ((i1 += 4) >= lastPlus1) {
677                                                                                         i1 -= lastPlus1;
678                                                                                     }
679                                                                                     if ((i2 += 4) >= lastPlus1) {
680                                                                                         i2 -= lastPlus1;
681                                                                                     }
682                                                                                     workDoneShadow++;
683                                                                                     continue X;
684                                                                                 } else if ((quadrant[i1 + 3] > quadrant[i2 + 3])) {
685                                                                                     continue HAMMER;
686                                                                                 } else {
687                                                                                     break HAMMER;
688                                                                                 }
689                                                                             } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
690                                                                                 continue HAMMER;
691                                                                             } else {
692                                                                                 break HAMMER;
693                                                                             }
694                                                                         } else if ((quadrant[i1 + 2] > quadrant[i2 + 2])) {
695                                                                             continue HAMMER;
696                                                                         } else {
697                                                                             break HAMMER;
698                                                                         }
699                                                                     } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
700                                                                         continue HAMMER;
701                                                                     } else {
702                                                                         break HAMMER;
703                                                                     }
704                                                                 } else if ((quadrant[i1 + 1] > quadrant[i2 + 1])) {
705                                                                     continue HAMMER;
706                                                                 } else {
707                                                                     break HAMMER;
708                                                                 }
709                                                             } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
710                                                                 continue HAMMER;
711                                                             } else {
712                                                                 break HAMMER;
713                                                             }
714                                                         } else if ((quadrant[i1] > quadrant[i2])) {
715                                                             continue HAMMER;
716                                                         } else {
717                                                             break HAMMER;
718                                                         }
719                                                     } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
720                                                         continue HAMMER;
721                                                     } else {
722                                                         break HAMMER;
723                                                     }
724 
725                                                 }
726                                                 break HAMMER;
727                                             } // while x > 0
728                                             else {
729                                                 if ((block[i1] & 0xff) > (block[i2] & 0xff)) {
730                                                     continue HAMMER;
731                                                 } else {
732                                                     break HAMMER;
733                                                 }
734                                             }
735                                         } else if ((block[i1 + 5] & 0xff) > (block[i2 + 5] & 0xff)) {
736                                             continue HAMMER;
737                                         } else {
738                                             break HAMMER;
739                                         }
740                                     } else if ((block[i1 + 4] & 0xff) > (block[i2 + 4] & 0xff)) {
741                                         continue HAMMER;
742                                     } else {
743                                         break HAMMER;
744                                     }
745                                 } else if ((block[i1 + 3] & 0xff) > (block[i2 + 3] & 0xff)) {
746                                     continue HAMMER;
747                                 } else {
748                                     break HAMMER;
749                                 }
750                             } else if ((block[i1 + 2] & 0xff) > (block[i2 + 2] & 0xff)) {
751                                 continue HAMMER;
752                             } else {
753                                 break HAMMER;
754                             }
755                         } else if ((block[i1 + 1] & 0xff) > (block[i2 + 1] & 0xff)) {
756                             continue HAMMER;
757                         } else {
758                             break HAMMER;
759                         }
760 
761                     } // HAMMER
762                     // end inline mainGTU
763 
764                     fmap[j] = v;
765                 }
766 
767                 if (firstAttemptShadow && (i <= hi)
768                     && (workDoneShadow > workLimitShadow)) {
769                     break HP;
770                 }
771             }
772         }
773 
774         this.workDone = workDoneShadow;
775         return firstAttemptShadow && (workDoneShadow > workLimitShadow);
776     }
777 
778 /*--
779    LBZ2: The following is an implementation of
780    an elegant 3-way quicksort for strings,
781    described in a paper "Fast Algorithms for
782    Sorting and Searching Strings", by Robert
783    Sedgewick and Jon L. Bentley.
784 --*/
785 
786     private static void vswap(int[] fmap, int p1, int p2, int n) {
787         n += p1;
788         while (p1 < n) {
789             int t = fmap[p1];
790             fmap[p1++] = fmap[p2];
791             fmap[p2++] = t;
792         }
793     }
794 
795     private static byte med3(byte a, byte b, byte c) {
796         return (a < b) ? (b < c ? b : a < c ? c : a) : (b > c ? b : a > c ? c
797                                                         : a);
798     }
799 
800     private static final int SMALL_THRESH = 20;
801     private static final int DEPTH_THRESH = 10;
802     private static final int WORK_FACTOR = 30;
803 
804     /**
805      * Method "mainQSort3", file "blocksort.c", BZip2 1.0.2
806      */
807     private void mainQSort3(final BZip2CompressorOutputStream.Data dataShadow,
808                             final int loSt, final int hiSt, final int dSt,
809                             final int last) {
810         final int[] stack_ll = this.stack_ll;
811         final int[] stack_hh = this.stack_hh;
812         final int[] stack_dd = this.stack_dd;
813         final int[] fmap = dataShadow.fmap;
814         final byte[] block = dataShadow.block;
815 
816         stack_ll[0] = loSt;
817         stack_hh[0] = hiSt;
818         stack_dd[0] = dSt;
819 
820         for (int sp = 1; --sp >= 0;) {
821             final int lo = stack_ll[sp];
822             final int hi = stack_hh[sp];
823             final int d = stack_dd[sp];
824 
825             if ((hi - lo < SMALL_THRESH) || (d > DEPTH_THRESH)) {
826                 if (mainSimpleSort(dataShadow, lo, hi, d, last)) {
827                     return;
828                 }
829             } else {
830                 final int d1 = d + 1;
831                 final int med = med3(block[fmap[lo] + d1],
832                                      block[fmap[hi] + d1], block[fmap[(lo + hi) >>> 1] + d1]) & 0xff;
833 
834                 int unLo = lo;
835                 int unHi = hi;
836                 int ltLo = lo;
837                 int gtHi = hi;
838 
839                 while (true) {
840                     while (unLo <= unHi) {
841                         final int n = (block[fmap[unLo] + d1] & 0xff)
842                             - med;
843                         if (n == 0) {
844                             final int temp = fmap[unLo];
845                             fmap[unLo++] = fmap[ltLo];
846                             fmap[ltLo++] = temp;
847                         } else if (n < 0) {
848                             unLo++;
849                         } else {
850                             break;
851                         }
852                     }
853 
854                     while (unLo <= unHi) {
855                         final int n = (block[fmap[unHi] + d1] & 0xff)
856                             - med;
857                         if (n == 0) {
858                             final int temp = fmap[unHi];
859                             fmap[unHi--] = fmap[gtHi];
860                             fmap[gtHi--] = temp;
861                         } else if (n > 0) {
862                             unHi--;
863                         } else {
864                             break;
865                         }
866                     }
867 
868                     if (unLo <= unHi) {
869                         final int temp = fmap[unLo];
870                         fmap[unLo++] = fmap[unHi];
871                         fmap[unHi--] = temp;
872                     } else {
873                         break;
874                     }
875                 }
876 
877                 if (gtHi < ltLo) {
878                     stack_ll[sp] = lo;
879                     stack_hh[sp] = hi;
880                     stack_dd[sp] = d1;
881                     sp++;
882                 } else {
883                     int n = ((ltLo - lo) < (unLo - ltLo)) ? (ltLo - lo)
884                         : (unLo - ltLo);
885                     vswap(fmap, lo, unLo - n, n);
886                     int m = ((hi - gtHi) < (gtHi - unHi)) ? (hi - gtHi)
887                         : (gtHi - unHi);
888                     vswap(fmap, unLo, hi - m + 1, m);
889 
890                     n = lo + unLo - ltLo - 1;
891                     m = hi - (gtHi - unHi) + 1;
892 
893                     stack_ll[sp] = lo;
894                     stack_hh[sp] = n;
895                     stack_dd[sp] = d;
896                     sp++;
897 
898                     stack_ll[sp] = n + 1;
899                     stack_hh[sp] = m - 1;
900                     stack_dd[sp] = d1;
901                     sp++;
902 
903                     stack_ll[sp] = m;
904                     stack_hh[sp] = hi;
905                     stack_dd[sp] = d;
906                     sp++;
907                 }
908             }
909         }
910     }
911 
912     private static final int SETMASK = (1 << 21);
913     private static final int CLEARMASK = (~SETMASK);
914 
915     final void mainSort(final BZip2CompressorOutputStream.Data dataShadow,
916                         final int lastShadow) {
917         final int[] runningOrder = this.mainSort_runningOrder;
918         final int[] copy = this.mainSort_copy;
919         final boolean[] bigDone = this.mainSort_bigDone;
920         final int[] ftab = this.ftab;
921         final byte[] block = dataShadow.block;
922         final int[] fmap = dataShadow.fmap;
923         final char[] quadrant = this.quadrant;
924         final int workLimitShadow = this.workLimit;
925         final boolean firstAttemptShadow = this.firstAttempt;
926 
927         // LBZ2: Set up the 2-byte frequency table
928         for (int i = 65537; --i >= 0;) {
929             ftab[i] = 0;
930         }
931 
932         /*
933          * In the various block-sized structures, live data runs from 0 to
934          * last+NUM_OVERSHOOT_BYTES inclusive. First, set up the overshoot area
935          * for block.
936          */
937         for (int i = 0; i < BZip2Constants.NUM_OVERSHOOT_BYTES; i++) {
938             block[lastShadow + i + 2] = block[(i % (lastShadow + 1)) + 1];
939         }
940         for (int i = lastShadow + BZip2Constants.NUM_OVERSHOOT_BYTES +1; --i >= 0;) {
941             quadrant[i] = 0;
942         }
943         block[0] = block[lastShadow + 1];
944 
945         // LBZ2: Complete the initial radix sort:
946 
947         int c1 = block[0] & 0xff;
948         for (int i = 0; i <= lastShadow; i++) {
949             final int c2 = block[i + 1] & 0xff;
950             ftab[(c1 << 8) + c2]++;
951             c1 = c2;
952         }
953 
954         for (int i = 1; i <= 65536; i++) {
955             ftab[i] += ftab[i - 1];
956         }
957 
958         c1 = block[1] & 0xff;
959         for (int i = 0; i < lastShadow; i++) {
960             final int c2 = block[i + 2] & 0xff;
961             fmap[--ftab[(c1 << 8) + c2]] = i;
962             c1 = c2;
963         }
964 
965         fmap[--ftab[((block[lastShadow + 1] & 0xff) << 8) + (block[1] & 0xff)]] = lastShadow;
966 
967         /*
968          * LBZ2: Now ftab contains the first loc of every small bucket. Calculate the
969          * running order, from smallest to largest big bucket.
970          */
971         for (int i = 256; --i >= 0;) {
972             bigDone[i] = false;
973             runningOrder[i] = i;
974         }
975 
976         for (int h = 364; h != 1;) {
977             h /= 3;
978             for (int i = h; i <= 255; i++) {
979                 final int vv = runningOrder[i];
980                 final int a = ftab[(vv + 1) << 8] - ftab[vv << 8];
981                 final int b = h - 1;
982                 int j = i;
983                 for (int ro = runningOrder[j - h]; (ftab[(ro + 1) << 8] - ftab[ro << 8]) > a; ro = runningOrder[j
984                                                                                                                 - h]) {
985                     runningOrder[j] = ro;
986                     j -= h;
987                     if (j <= b) {
988                         break;
989                     }
990                 }
991                 runningOrder[j] = vv;
992             }
993         }
994 
995         /*
996          * LBZ2: The main sorting loop.
997          */
998         for (int i = 0; i <= 255; i++) {
999             /*
1000              * LBZ2: Process big buckets, starting with the least full.
1001              */
1002             final int ss = runningOrder[i];
1003 
1004             // Step 1:
1005             /*
1006              * LBZ2: Complete the big bucket [ss] by quicksorting any unsorted small
1007              * buckets [ss, j]. Hopefully previous pointer-scanning phases have
1008              * already completed many of the small buckets [ss, j], so we don't
1009              * have to sort them at all.
1010              */
1011             for (int j = 0; j <= 255; j++) {
1012                 final int sb = (ss << 8) + j;
1013                 final int ftab_sb = ftab[sb];
1014                 if ((ftab_sb & SETMASK) != SETMASK) {
1015                     final int lo = ftab_sb & CLEARMASK;
1016                     final int hi = (ftab[sb + 1] & CLEARMASK) - 1;
1017                     if (hi > lo) {
1018                         mainQSort3(dataShadow, lo, hi, 2, lastShadow);
1019                         if (firstAttemptShadow
1020                             && (this.workDone > workLimitShadow)) {
1021                             return;
1022                         }
1023                     }
1024                     ftab[sb] = ftab_sb | SETMASK;
1025                 }
1026             }
1027 
1028             // Step 2:
1029             // LBZ2: Now scan this big bucket so as to synthesise the
1030             // sorted order for small buckets [t, ss] for all t != ss.
1031 
1032             for (int j = 0; j <= 255; j++) {
1033                 copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
1034             }
1035 
1036             for (int j = ftab[ss << 8] & CLEARMASK, hj = (ftab[(ss + 1) << 8] & CLEARMASK); j < hj; j++) {
1037                 final int fmap_j = fmap[j];
1038                 c1 = block[fmap_j] & 0xff;
1039                 if (!bigDone[c1]) {
1040                     fmap[copy[c1]] = (fmap_j == 0) ? lastShadow : (fmap_j - 1);
1041                     copy[c1]++;
1042                 }
1043             }
1044 
1045             for (int j = 256; --j >= 0;) {
1046                 ftab[(j << 8) + ss] |= SETMASK;
1047             }
1048 
1049             // Step 3:
1050             /*
1051              * LBZ2: The ss big bucket is now done. Record this fact, and update the
1052              * quadrant descriptors. Remember to update quadrants in the
1053              * overshoot area too, if necessary. The "if (i < 255)" test merely
1054              * skips this updating for the last bucket processed, since updating
1055              * for the last bucket is pointless.
1056              */
1057             bigDone[ss] = true;
1058 
1059             if (i < 255) {
1060                 final int bbStart = ftab[ss << 8] & CLEARMASK;
1061                 final int bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart;
1062                 int shifts = 0;
1063 
1064                 while ((bbSize >> shifts) > 65534) {
1065                     shifts++;
1066                 }
1067 
1068                 for (int j = 0; j < bbSize; j++) {
1069                     final int a2update = fmap[bbStart + j];
1070                     final char qVal = (char) (j >> shifts);
1071                     quadrant[a2update] = qVal;
1072                     if (a2update < BZip2Constants.NUM_OVERSHOOT_BYTES) {
1073                         quadrant[a2update + lastShadow + 1] = qVal;
1074                     }
1075                 }
1076             }
1077 
1078         }
1079     }
1080 
1081 }