View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz77support;
20  
21  import java.io.IOException;
22  import java.util.Arrays;
23  import java.util.Objects;
24  
25  /**
26   * Helper class for compression algorithms that use the ideas of LZ77.
27   *
28   * <p>
29   * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
30   * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
31   * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
32   * case of DEFLATE for example).
33   * </p>
34   *
35   * <p>
36   * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
37   * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
38   * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
39   * </p>
40   *
41   * <p>
42   * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
43   * a whole family of algorithms.
44   * </p>
45   *
46   * <p>
47   * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
48   * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
49   * information, the {@code #finish} method must be used once all data has been fed into the compressor.
50   * </p>
51   *
52   * <p>
53   * Several parameters influence the outcome of the "compression":
54   * </p>
55   * <dl>
56   *
57   * <dt>{@code windowSize}</dt>
58   * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
59   * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
60   *
61   * <dt>{@code minBackReferenceLength}</dt>
62   * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
63   *
64   * <dt>{@code maxBackReferenceLength}</dt>
65   * <dd>Maximal length of a back-reference found.</dd>
66   *
67   * <dt>{@code maxOffset}</dt>
68   * <dd>Maximal offset of a back-reference.</dd>
69   *
70   * <dt>{@code maxLiteralLength}</dt>
71   * <dd>Maximal length of a literal block.</dd>
72   * </dl>
73   *
74   * @see "https://tools.ietf.org/html/rfc1951#section-4"
75   * @since 1.14
76   * @NotThreadSafe
77   */
78  public class LZ77Compressor {
79  
80      /**
81       * Represents a back-reference.
82       */
83      public static final class BackReference extends Block {
84          private final int offset, length;
85  
86          public BackReference(final int offset, final int length) {
87              this.offset = offset;
88              this.length = length;
89          }
90  
91          /**
92           * Provides the length of the back-reference.
93           *
94           * @return the length
95           */
96          public int getLength() {
97              return length;
98          }
99  
100         /**
101          * Provides the offset of the back-reference.
102          *
103          * @return the offset
104          */
105         public int getOffset() {
106             return offset;
107         }
108 
109         @Override
110         public BlockType getType() {
111             return BlockType.BACK_REFERENCE;
112         }
113 
114         @Override
115         public String toString() {
116             return "BackReference with offset " + offset + " and length " + length;
117         }
118     }
119 
120     /**
121      * Base class representing blocks the compressor may emit.
122      *
123      * <p>
124      * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
125      * may get introduced with future releases.
126      * </p>
127      */
128     public abstract static class Block {
129         /** Enumeration of the block types the compressor may emit. */
130         public enum BlockType {
131             LITERAL, BACK_REFERENCE, EOD
132         }
133 
134         public abstract BlockType getType();
135     }
136 
137     /**
138      * Callback invoked while the compressor processes data.
139      *
140      * <p>
141      * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
142      * {@link #compress} or {@link #finish}.
143      * </p>
144      */
145     public interface Callback {
146         /**
147          * Consumes a block.
148          *
149          * @param b the block to consume
150          * @throws IOException in case of an error
151          */
152         void accept(Block b) throws IOException;
153     }
154 
155     /** A simple "we are done" marker. */
156     public static final class EOD extends Block {
157         @Override
158         public BlockType getType() {
159             return BlockType.EOD;
160         }
161     }
162 
163     /**
164      * Represents a literal block of data.
165      *
166      * <p>
167      * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
168      * immediately as it will get overwritten sooner or later.
169      * </p>
170      */
171     public static final class LiteralBlock extends Block {
172         private final byte[] data;
173         private final int offset, length;
174 
175         public LiteralBlock(final byte[] data, final int offset, final int length) {
176             this.data = data;
177             this.offset = offset;
178             this.length = length;
179         }
180 
181         /**
182          * The literal data.
183          *
184          * <p>
185          * This returns a life view of the actual data in order to avoid copying, modify the array at your own risk.
186          * </p>
187          *
188          * @return the data
189          */
190         public byte[] getData() {
191             return data;
192         }
193 
194         /**
195          * Length of literal block.
196          *
197          * @return the length
198          */
199         public int getLength() {
200             return length;
201         }
202 
203         /**
204          * Offset into data where the literal block starts.
205          *
206          * @return the offset
207          */
208         public int getOffset() {
209             return offset;
210         }
211 
212         @Override
213         public BlockType getType() {
214             return BlockType.LITERAL;
215         }
216 
217         @Override
218         public String toString() {
219             return "LiteralBlock starting at " + offset + " with length " + length;
220         }
221     }
222 
223     private static final Block THE_EOD = new EOD();
224 
225     static final int NUMBER_OF_BYTES_IN_HASH = 3;
226     private static final int NO_MATCH = -1;
227 
228     // we use a 15 bit hash code as calculated in updateHash
229     private static final int HASH_SIZE = 1 << 15;
230     private static final int HASH_MASK = HASH_SIZE - 1;
231 
232     private static final int H_SHIFT = 5;
233     private final Parameters params;
234     private final Callback callback;
235 
236     // the sliding window, twice as big as "windowSize" parameter
237     private final byte[] window;
238 
239     // the head of hash-chain - indexed by hash-code, points to the
240     // location inside of window of the latest sequence of bytes with
241     // the given hash.
242     private final int[] head;
243     // for each window-location points to the latest earlier location
244     // with the same hash. Only stores values for the latest
245     // "windowSize" elements, the index is "window location modulo
246     // windowSize".
247     private final int[] prev;
248     // bit mask used when indexing into prev
249     private final int wMask;
250     private boolean initialized;
251     // the position inside of window that shall be encoded right now
252     private int currentPosition;
253     // the number of bytes available to compress including the one at
254     // currentPosition
255     private int lookahead;
256     // the hash of the three bytes stating at the current position
257     private int insertHash;
258 
259     // the position inside the window where the current literal
260     // block starts (in case we are inside a literal block).
261     private int blockStart;
262 
263     // position of the current match
264     private int matchStart = NO_MATCH;
265 
266     // number of missed insertString calls for the up to three last
267     // bytes of the last match that can only be performed once more
268     // data has been read
269     private int missedInserts;
270 
271     /**
272      * Initializes a compressor with parameters and a callback.
273      *
274      * @param params   the parameters
275      * @param callback the callback
276      * @throws NullPointerException if either parameter is {@code null}
277      */
278     public LZ77Compressor(final Parameters params, final Callback callback) {
279         Objects.requireNonNull(params, "params");
280         Objects.requireNonNull(callback, "callback");
281 
282         this.params = params;
283         this.callback = callback;
284 
285         final int wSize = params.getWindowSize();
286         window = new byte[wSize * 2];
287         wMask = wSize - 1;
288         head = new int[HASH_SIZE];
289         Arrays.fill(head, NO_MATCH);
290         prev = new int[wSize];
291     }
292 
293     private void catchUpMissedInserts() {
294         while (missedInserts > 0) {
295             insertString(currentPosition - missedInserts--);
296         }
297     }
298 
299     private void compress() throws IOException {
300         final int minMatch = params.getMinBackReferenceLength();
301         final boolean lazy = params.getLazyMatching();
302         final int lazyThreshold = params.getLazyMatchingThreshold();
303 
304         while (lookahead >= minMatch) {
305             catchUpMissedInserts();
306             int matchLength = 0;
307             final int hashHead = insertString(currentPosition);
308             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
309                 // sets matchStart as a side effect
310                 matchLength = longestMatch(hashHead);
311 
312                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
313                     // try to find a longer match using the next position
314                     matchLength = longestMatchForNextPosition(matchLength);
315                 }
316             }
317             if (matchLength >= minMatch) {
318                 if (blockStart != currentPosition) {
319                     // emit preceding literal block
320                     flushLiteralBlock();
321                     blockStart = NO_MATCH;
322                 }
323                 flushBackReference(matchLength);
324                 insertStringsInMatch(matchLength);
325                 lookahead -= matchLength;
326                 currentPosition += matchLength;
327                 blockStart = currentPosition;
328             } else {
329                 // no match, append to current or start a new literal
330                 lookahead--;
331                 currentPosition++;
332                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
333                     flushLiteralBlock();
334                     blockStart = currentPosition;
335                 }
336             }
337         }
338     }
339 
340     /**
341      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
342      *
343      * @param data the data to compress - must not be null
344      * @throws IOException if the callback throws an exception
345      */
346     public void compress(final byte[] data) throws IOException {
347         compress(data, 0, data.length);
348     }
349 
350     /**
351      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
352      *
353      * @param data the data to compress - must not be null
354      * @param off  the start offset of the data
355      * @param len  the number of bytes to compress
356      * @throws IOException if the callback throws an exception
357      */
358     public void compress(final byte[] data, int off, int len) throws IOException {
359         final int wSize = params.getWindowSize();
360         while (len > wSize) { // chop into windowSize sized chunks
361             doCompress(data, off, wSize);
362             off += wSize;
363             len -= wSize;
364         }
365         if (len > 0) {
366             doCompress(data, off, len);
367         }
368     }
369 
370     // performs the actual algorithm with the pre-condition len <= windowSize
371     private void doCompress(final byte[] data, final int off, final int len) throws IOException {
372         final int spaceLeft = window.length - currentPosition - lookahead;
373         if (len > spaceLeft) {
374             slide();
375         }
376         System.arraycopy(data, off, window, currentPosition + lookahead, len);
377         lookahead += len;
378         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
379             initialize();
380         }
381         if (initialized) {
382             compress();
383         }
384     }
385 
386     /**
387      * Tells the compressor to process all remaining data and signal end of data to the callback.
388      *
389      * <p>
390      * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
391      * </p>
392      *
393      * @throws IOException if the callback throws an exception
394      */
395     public void finish() throws IOException {
396         if (blockStart != currentPosition || lookahead > 0) {
397             currentPosition += lookahead;
398             flushLiteralBlock();
399         }
400         callback.accept(THE_EOD);
401     }
402 
403     private void flushBackReference(final int matchLength) throws IOException {
404         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
405     }
406 
407     private void flushLiteralBlock() throws IOException {
408         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
409     }
410 
411     private void initialize() {
412         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
413             insertHash = nextHash(insertHash, window[i]);
414         }
415         initialized = true;
416     }
417 
418     /**
419      * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
420      *
421      * <p>
422      * Updates {@code insertHash} and {@code prev} as a side effect.
423      * </p>
424      */
425     private int insertString(final int pos) {
426         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
427         final int hashHead = head[insertHash];
428         prev[pos & wMask] = hashHead;
429         head[insertHash] = pos;
430         return hashHead;
431     }
432 
433     private void insertStringsInMatch(final int matchLength) {
434         // inserts strings contained in current match
435         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
436         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
437         // currentPosition has been inserted already
438         for (int i = 1; i <= stop; i++) {
439             insertString(currentPosition + i);
440         }
441         missedInserts = matchLength - stop - 1;
442     }
443 
444     /**
445      * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
446      *
447      * <p>
448      * Sets matchStart to the index of the start position of the longest match as a side effect.
449      * </p>
450      */
451     private int longestMatch(int matchHead) {
452         final int minLength = params.getMinBackReferenceLength();
453         int longestMatchLength = minLength - 1;
454         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
455         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
456         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
457         final int maxCandidates = params.getMaxCandidates();
458         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
459             int currentLength = 0;
460             for (int i = 0; i < maxPossibleLength; i++) {
461                 if (window[matchHead + i] != window[currentPosition + i]) {
462                     break;
463                 }
464                 currentLength++;
465             }
466             if (currentLength > longestMatchLength) {
467                 longestMatchLength = currentLength;
468                 matchStart = matchHead;
469                 if (currentLength >= niceBackReferenceLength) {
470                     // no need to search any further
471                     break;
472                 }
473             }
474             matchHead = prev[matchHead & wMask];
475         }
476         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
477     }
478 
479     private int longestMatchForNextPosition(final int prevMatchLength) {
480         // save a bunch of values to restore them if the next match isn't better than the current one
481         final int prevMatchStart = matchStart;
482         final int prevInsertHash = insertHash;
483 
484         lookahead--;
485         currentPosition++;
486         final int hashHead = insertString(currentPosition);
487         final int prevHashHead = prev[currentPosition & wMask];
488         int matchLength = longestMatch(hashHead);
489 
490         if (matchLength <= prevMatchLength) {
491             // use the first match, as the next one isn't any better
492             matchLength = prevMatchLength;
493             matchStart = prevMatchStart;
494 
495             // restore modified values
496             head[insertHash] = prevHashHead;
497             insertHash = prevInsertHash;
498             currentPosition--;
499             lookahead++;
500         }
501         return matchLength;
502     }
503 
504     /**
505      * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
506      * nextHash(H, D).
507      *
508      * <p>
509      * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
510      * </p>
511      */
512     private int nextHash(final int oldHash, final byte nextByte) {
513         final int nextVal = nextByte & 0xFF;
514         return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
515     }
516 
517     /**
518      * Adds some initial data to fill the window with.
519      *
520      * <p>
521      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
522      * LZ4 frame format using block dependency.
523      * </p>
524      *
525      * @param data the data to fill the window with.
526      * @throws IllegalStateException if the compressor has already started to accept data
527      */
528     public void prefill(final byte[] data) {
529         if (currentPosition != 0 || lookahead != 0) {
530             throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
531         }
532 
533         // don't need more than windowSize for back-references
534         final int len = Math.min(params.getWindowSize(), data.length);
535         System.arraycopy(data, data.length - len, window, 0, len);
536 
537         if (len >= NUMBER_OF_BYTES_IN_HASH) {
538             initialize();
539             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
540             for (int i = 0; i < stop; i++) {
541                 insertString(i);
542             }
543             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
544         } else { // not enough data to hash anything
545             missedInserts = len;
546         }
547         blockStart = currentPosition = len;
548     }
549 
550     private void slide() throws IOException {
551         final int wSize = params.getWindowSize();
552         if (blockStart != currentPosition && blockStart < wSize) {
553             flushLiteralBlock();
554             blockStart = currentPosition;
555         }
556         System.arraycopy(window, wSize, window, 0, wSize);
557         currentPosition -= wSize;
558         matchStart -= wSize;
559         blockStart -= wSize;
560         for (int i = 0; i < HASH_SIZE; i++) {
561             final int h = head[i];
562             head[i] = h >= wSize ? h - wSize : NO_MATCH;
563         }
564         for (int i = 0; i < wSize; i++) {
565             final int p = prev[i];
566             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
567         }
568     }
569 }