View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz77support;
20  
21  import java.io.IOException;
22  import java.util.Objects;
23  
24  import org.apache.commons.lang3.ArrayFill;
25  
26  /**
27   * Helper class for compression algorithms that use the ideas of LZ77.
28   *
29   * <p>
30   * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
31   * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
32   * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
33   * case of DEFLATE for example).
34   * </p>
35   *
36   * <p>
37   * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
38   * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
39   * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
40   * </p>
41   *
42   * <p>
43   * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
44   * a whole family of algorithms.
45   * </p>
46   *
47   * <p>
48   * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
49   * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
50   * information, the {@code #finish} method must be used once all data has been fed into the compressor.
51   * </p>
52   *
53   * <p>
54   * Several parameters influence the outcome of the "compression":
55   * </p>
56   * <dl>
57   *
58   * <dt>{@code windowSize}</dt>
59   * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
60   * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
61   *
62   * <dt>{@code minBackReferenceLength}</dt>
63   * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
64   *
65   * <dt>{@code maxBackReferenceLength}</dt>
66   * <dd>Maximal length of a back-reference found.</dd>
67   *
68   * <dt>{@code maxOffset}</dt>
69   * <dd>Maximal offset of a back-reference.</dd>
70   *
71   * <dt>{@code maxLiteralLength}</dt>
72   * <dd>Maximal length of a literal block.</dd>
73   * </dl>
74   *
75   * @see "https://tools.ietf.org/html/rfc1951#section-4"
76   * @since 1.14
77   * @NotThreadSafe
78   */
79  public class LZ77Compressor {
80  
81      /**
82       * Represents a back-reference.
83       */
84      public abstract static class AbstractReference extends Block {
85  
86          private final int offset;
87          private final int length;
88  
89          /**
90           * Constructs a new instance.
91           *
92           * @param blockType The block type.
93           * @param offset the offset of the reference.
94           * @param length the offset of the reference.
95           */
96          public AbstractReference(final BlockType blockType, final int offset, final int length) {
97              super(blockType);
98              this.offset = offset;
99              this.length = length;
100         }
101 
102         /**
103          * Gets the offset of the reference.
104          *
105          * @return the length
106          */
107         public int getLength() {
108             return length;
109         }
110 
111         /**
112          * Gets the offset of the reference.
113          *
114          * @return the offset
115          */
116         public int getOffset() {
117             return offset;
118         }
119 
120         @Override
121         public String toString() {
122             return super.toString() + " with offset " + offset + " and length " + length;
123         }
124     }
125 
126     /**
127      * Represents a back-reference.
128      */
129     public static final class BackReference extends AbstractReference {
130 
131         /**
132          * Constructs a new instance.
133          *
134          * @param offset the offset of the back-reference.
135          * @param length the offset of the back-reference.
136          */
137         public BackReference(final int offset, final int length) {
138             super(BlockType.BACK_REFERENCE, offset, length);
139         }
140 
141     }
142 
143     /**
144      * Base class representing blocks the compressor may emit.
145      *
146      * <p>
147      * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
148      * may get introduced with future releases.
149      * </p>
150      */
151     public abstract static class Block {
152 
153         /**
154          * Enumerates the block types the compressor emits.
155          */
156         public enum BlockType {
157 
158             /**
159              * The literal block type.
160              */
161             LITERAL,
162 
163             /**
164              * The back-reference block type.
165              */
166             BACK_REFERENCE,
167 
168             /**
169              * The end-of-data block type.
170              */
171             EOD
172         }
173 
174         private final BlockType type;
175 
176         /**
177          * Constructs a new typeless instance.
178          *
179          * @deprecated Use {@link #Block()}.
180          */
181         @Deprecated
182         public Block() {
183             this.type = null;
184         }
185 
186         /**
187          * Constructs a new instance.
188          *
189          * @param type the block type, may not be {@code null}.
190          */
191         protected Block(final BlockType type) {
192             this.type = Objects.requireNonNull(type);
193         }
194 
195         /**
196          * Gets the the block type.
197          *
198          * @return the the block type.
199          */
200         public BlockType getType() {
201             return type;
202         }
203 
204         @Override
205         public String toString() {
206             return getClass().getSimpleName() + " " + getType();
207         }
208     }
209 
210     /**
211      * Callback invoked while the compressor processes data.
212      *
213      * <p>
214      * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
215      * {@link #compress} or {@link #finish}.
216      * </p>
217      */
218     public interface Callback {
219 
220         /**
221          * Consumes a block.
222          *
223          * @param b the block to consume
224          * @throws IOException in case of an error
225          */
226         void accept(Block b) throws IOException;
227     }
228 
229     /** A simple "we are done" marker. */
230     public static final class EOD extends Block {
231 
232         /**
233          * The singleton instance.
234          */
235         private static final EOD INSTANCE = new EOD();
236 
237         /**
238          * Constructs a new instance.
239          */
240         public EOD() {
241             super(BlockType.EOD);
242         }
243 
244     }
245 
246     /**
247      * Represents a literal block of data.
248      *
249      * <p>
250      * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
251      * immediately as it will get overwritten sooner or later.
252      * </p>
253      */
254     public static final class LiteralBlock extends AbstractReference {
255 
256         private final byte[] data;
257 
258         /**
259          * Constructs a new instance.
260          *
261          * @param data the literal data.
262          * @param offset the length of literal block.
263          * @param length the length of literal block.
264          */
265         public LiteralBlock(final byte[] data, final int offset, final int length) {
266             super(BlockType.LITERAL, offset, length);
267             this.data = data;
268         }
269 
270         /**
271          * Gets the literal data.
272          *
273          * <p>
274          * This returns a live view of the actual data in order to avoid copying, modify the array at your own risk.
275          * </p>
276          *
277          * @return the data
278          */
279         public byte[] getData() {
280             return data;
281         }
282 
283     }
284 
285     static final int NUMBER_OF_BYTES_IN_HASH = 3;
286     private static final int NO_MATCH = -1;
287 
288     // we use a 15 bit hash code as calculated in updateHash
289     private static final int HASH_SIZE = 1 << 15;
290     private static final int HASH_MASK = HASH_SIZE - 1;
291 
292     private static final int H_SHIFT = 5;
293     private final Parameters params;
294     private final Callback callback;
295 
296     // the sliding window, twice as big as "windowSize" parameter
297     private final byte[] window;
298 
299     // the head of hash-chain - indexed by hash-code, points to the
300     // location inside of window of the latest sequence of bytes with
301     // the given hash.
302     private final int[] head;
303     // for each window-location points to the latest earlier location
304     // with the same hash. Only stores values for the latest
305     // "windowSize" elements, the index is "window location modulo
306     // windowSize".
307     private final int[] prev;
308     // bit mask used when indexing into prev
309     private final int wMask;
310     private boolean initialized;
311     // the position inside of window that shall be encoded right now
312     private int currentPosition;
313     // the number of bytes available to compress including the one at
314     // currentPosition
315     private int lookahead;
316     // the hash of the three bytes stating at the current position
317     private int insertHash;
318 
319     // the position inside the window where the current literal
320     // block starts (in case we are inside a literal block).
321     private int blockStart;
322 
323     // position of the current match
324     private int matchStart = NO_MATCH;
325 
326     // number of missed insertString calls for the up to three last
327     // bytes of the last match that can only be performed once more
328     // data has been read
329     private int missedInserts;
330 
331     /**
332      * Initializes a compressor with parameters and a callback.
333      *
334      * @param params   the parameters
335      * @param callback the callback
336      * @throws NullPointerException if either parameter is {@code null}
337      */
338     public LZ77Compressor(final Parameters params, final Callback callback) {
339         Objects.requireNonNull(params, "params");
340         Objects.requireNonNull(callback, "callback");
341 
342         this.params = params;
343         this.callback = callback;
344 
345         final int wSize = params.getWindowSize();
346         window = new byte[wSize * 2];
347         wMask = wSize - 1;
348         head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
349         prev = new int[wSize];
350     }
351 
352     private void catchUpMissedInserts() {
353         while (missedInserts > 0) {
354             insertString(currentPosition - missedInserts--);
355         }
356     }
357 
358     private void compress() throws IOException {
359         final int minMatch = params.getMinBackReferenceLength();
360         final boolean lazy = params.getLazyMatching();
361         final int lazyThreshold = params.getLazyMatchingThreshold();
362 
363         while (lookahead >= minMatch) {
364             catchUpMissedInserts();
365             int matchLength = 0;
366             final int hashHead = insertString(currentPosition);
367             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
368                 // sets matchStart as a side effect
369                 matchLength = longestMatch(hashHead);
370 
371                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
372                     // try to find a longer match using the next position
373                     matchLength = longestMatchForNextPosition(matchLength);
374                 }
375             }
376             if (matchLength >= minMatch) {
377                 if (blockStart != currentPosition) {
378                     // emit preceding literal block
379                     flushLiteralBlock();
380                     blockStart = NO_MATCH;
381                 }
382                 flushBackReference(matchLength);
383                 insertStringsInMatch(matchLength);
384                 lookahead -= matchLength;
385                 currentPosition += matchLength;
386                 blockStart = currentPosition;
387             } else {
388                 // no match, append to current or start a new literal
389                 lookahead--;
390                 currentPosition++;
391                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
392                     flushLiteralBlock();
393                     blockStart = currentPosition;
394                 }
395             }
396         }
397     }
398 
399     /**
400      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
401      *
402      * @param data the data to compress - must not be null
403      * @throws IOException if the callback throws an exception
404      */
405     public void compress(final byte[] data) throws IOException {
406         compress(data, 0, data.length);
407     }
408 
409     /**
410      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
411      *
412      * @param data the data to compress - must not be null
413      * @param off  the start offset of the data
414      * @param len  the number of bytes to compress
415      * @throws IOException if the callback throws an exception
416      */
417     public void compress(final byte[] data, int off, int len) throws IOException {
418         final int wSize = params.getWindowSize();
419         while (len > wSize) { // chop into windowSize sized chunks
420             doCompress(data, off, wSize);
421             off += wSize;
422             len -= wSize;
423         }
424         if (len > 0) {
425             doCompress(data, off, len);
426         }
427     }
428 
429     // performs the actual algorithm with the pre-condition len <= windowSize
430     private void doCompress(final byte[] data, final int off, final int len) throws IOException {
431         final int spaceLeft = window.length - currentPosition - lookahead;
432         if (len > spaceLeft) {
433             slide();
434         }
435         System.arraycopy(data, off, window, currentPosition + lookahead, len);
436         lookahead += len;
437         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
438             initialize();
439         }
440         if (initialized) {
441             compress();
442         }
443     }
444 
445     /**
446      * Tells the compressor to process all remaining data and signal end of data to the callback.
447      *
448      * <p>
449      * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
450      * </p>
451      *
452      * @throws IOException if the callback throws an exception
453      */
454     public void finish() throws IOException {
455         if (blockStart != currentPosition || lookahead > 0) {
456             currentPosition += lookahead;
457             flushLiteralBlock();
458         }
459         callback.accept(EOD.INSTANCE);
460     }
461 
462     private void flushBackReference(final int matchLength) throws IOException {
463         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
464     }
465 
466     private void flushLiteralBlock() throws IOException {
467         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
468     }
469 
470     private void initialize() {
471         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
472             insertHash = nextHash(insertHash, window[i]);
473         }
474         initialized = true;
475     }
476 
477     /**
478      * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
479      *
480      * <p>
481      * Updates {@code insertHash} and {@code prev} as a side effect.
482      * </p>
483      */
484     private int insertString(final int pos) {
485         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
486         final int hashHead = head[insertHash];
487         prev[pos & wMask] = hashHead;
488         head[insertHash] = pos;
489         return hashHead;
490     }
491 
492     private void insertStringsInMatch(final int matchLength) {
493         // inserts strings contained in current match
494         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
495         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
496         // currentPosition has been inserted already
497         for (int i = 1; i <= stop; i++) {
498             insertString(currentPosition + i);
499         }
500         missedInserts = matchLength - stop - 1;
501     }
502 
503     /**
504      * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
505      *
506      * <p>
507      * Sets matchStart to the index of the start position of the longest match as a side effect.
508      * </p>
509      */
510     private int longestMatch(int matchHead) {
511         final int minLength = params.getMinBackReferenceLength();
512         int longestMatchLength = minLength - 1;
513         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
514         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
515         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
516         final int maxCandidates = params.getMaxCandidates();
517         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
518             int currentLength = 0;
519             for (int i = 0; i < maxPossibleLength; i++) {
520                 if (window[matchHead + i] != window[currentPosition + i]) {
521                     break;
522                 }
523                 currentLength++;
524             }
525             if (currentLength > longestMatchLength) {
526                 longestMatchLength = currentLength;
527                 matchStart = matchHead;
528                 if (currentLength >= niceBackReferenceLength) {
529                     // no need to search any further
530                     break;
531                 }
532             }
533             matchHead = prev[matchHead & wMask];
534         }
535         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
536     }
537 
538     private int longestMatchForNextPosition(final int prevMatchLength) {
539         // save a bunch of values to restore them if the next match isn't better than the current one
540         final int prevMatchStart = matchStart;
541         final int prevInsertHash = insertHash;
542 
543         lookahead--;
544         currentPosition++;
545         final int hashHead = insertString(currentPosition);
546         final int prevHashHead = prev[currentPosition & wMask];
547         int matchLength = longestMatch(hashHead);
548 
549         if (matchLength <= prevMatchLength) {
550             // use the first match, as the next one isn't any better
551             matchLength = prevMatchLength;
552             matchStart = prevMatchStart;
553 
554             // restore modified values
555             head[insertHash] = prevHashHead;
556             insertHash = prevInsertHash;
557             currentPosition--;
558             lookahead++;
559         }
560         return matchLength;
561     }
562 
563     /**
564      * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
565      * nextHash(H, D).
566      *
567      * <p>
568      * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
569      * </p>
570      */
571     private int nextHash(final int oldHash, final byte nextByte) {
572         final int nextVal = nextByte & 0xFF;
573         return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
574     }
575 
576     /**
577      * Adds some initial data to fill the window with.
578      *
579      * <p>
580      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
581      * LZ4 frame format using block dependency.
582      * </p>
583      *
584      * @param data the data to fill the window with.
585      * @throws IllegalStateException if the compressor has already started to accept data
586      */
587     public void prefill(final byte[] data) {
588         if (currentPosition != 0 || lookahead != 0) {
589             throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
590         }
591 
592         // don't need more than windowSize for back-references
593         final int len = Math.min(params.getWindowSize(), data.length);
594         System.arraycopy(data, data.length - len, window, 0, len);
595 
596         if (len >= NUMBER_OF_BYTES_IN_HASH) {
597             initialize();
598             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
599             for (int i = 0; i < stop; i++) {
600                 insertString(i);
601             }
602             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
603         } else { // not enough data to hash anything
604             missedInserts = len;
605         }
606         blockStart = currentPosition = len;
607     }
608 
609     private void slide() throws IOException {
610         final int wSize = params.getWindowSize();
611         if (blockStart != currentPosition && blockStart < wSize) {
612             flushLiteralBlock();
613             blockStart = currentPosition;
614         }
615         System.arraycopy(window, wSize, window, 0, wSize);
616         currentPosition -= wSize;
617         matchStart -= wSize;
618         blockStart -= wSize;
619         for (int i = 0; i < HASH_SIZE; i++) {
620             final int h = head[i];
621             head[i] = h >= wSize ? h - wSize : NO_MATCH;
622         }
623         for (int i = 0; i < wSize; i++) {
624             final int p = prev[i];
625             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
626         }
627     }
628 }