LZ77Compressor.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.lz77support;

  20. import java.io.IOException;
  21. import java.util.Objects;

  22. import org.apache.commons.lang3.ArrayFill;

  23. /**
  24.  * Helper class for compression algorithms that use the ideas of LZ77.
  25.  *
  26.  * <p>
  27.  * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
  28.  * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
  29.  * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
  30.  * case of DEFLATE for example).
  31.  * </p>
  32.  *
  33.  * <p>
  34.  * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
  35.  * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
  36.  * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
  37.  * </p>
  38.  *
  39.  * <p>
  40.  * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
  41.  * a whole family of algorithms.
  42.  * </p>
  43.  *
  44.  * <p>
  45.  * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
  46.  * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
  47.  * information, the {@code #finish} method must be used once all data has been fed into the compressor.
  48.  * </p>
  49.  *
  50.  * <p>
  51.  * Several parameters influence the outcome of the "compression":
  52.  * </p>
  53.  * <dl>
  54.  *
  55.  * <dt>{@code windowSize}</dt>
  56.  * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
  57.  * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
  58.  *
  59.  * <dt>{@code minBackReferenceLength}</dt>
  60.  * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
  61.  *
  62.  * <dt>{@code maxBackReferenceLength}</dt>
  63.  * <dd>Maximal length of a back-reference found.</dd>
  64.  *
  65.  * <dt>{@code maxOffset}</dt>
  66.  * <dd>Maximal offset of a back-reference.</dd>
  67.  *
  68.  * <dt>{@code maxLiteralLength}</dt>
  69.  * <dd>Maximal length of a literal block.</dd>
  70.  * </dl>
  71.  *
  72.  * @see "https://tools.ietf.org/html/rfc1951#section-4"
  73.  * @since 1.14
  74.  * @NotThreadSafe
  75.  */
  76. public class LZ77Compressor {

  77.     /**
  78.      * Represents a back-reference.
  79.      */
  80.     public static final class BackReference extends Block {
  81.         private final int offset, length;

  82.         public BackReference(final int offset, final int length) {
  83.             this.offset = offset;
  84.             this.length = length;
  85.         }

  86.         /**
  87.          * Provides the length of the back-reference.
  88.          *
  89.          * @return the length
  90.          */
  91.         public int getLength() {
  92.             return length;
  93.         }

  94.         /**
  95.          * Provides the offset of the back-reference.
  96.          *
  97.          * @return the offset
  98.          */
  99.         public int getOffset() {
  100.             return offset;
  101.         }

  102.         @Override
  103.         public BlockType getType() {
  104.             return BlockType.BACK_REFERENCE;
  105.         }

  106.         @Override
  107.         public String toString() {
  108.             return "BackReference with offset " + offset + " and length " + length;
  109.         }
  110.     }

  111.     /**
  112.      * Base class representing blocks the compressor may emit.
  113.      *
  114.      * <p>
  115.      * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
  116.      * may get introduced with future releases.
  117.      * </p>
  118.      */
  119.     public abstract static class Block {
  120.         /** Enumeration of the block types the compressor may emit. */
  121.         public enum BlockType {
  122.             LITERAL, BACK_REFERENCE, EOD
  123.         }

  124.         public abstract BlockType getType();
  125.     }

  126.     /**
  127.      * Callback invoked while the compressor processes data.
  128.      *
  129.      * <p>
  130.      * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
  131.      * {@link #compress} or {@link #finish}.
  132.      * </p>
  133.      */
  134.     public interface Callback {
  135.         /**
  136.          * Consumes a block.
  137.          *
  138.          * @param b the block to consume
  139.          * @throws IOException in case of an error
  140.          */
  141.         void accept(Block b) throws IOException;
  142.     }

  143.     /** A simple "we are done" marker. */
  144.     public static final class EOD extends Block {
  145.         @Override
  146.         public BlockType getType() {
  147.             return BlockType.EOD;
  148.         }
  149.     }

  150.     /**
  151.      * Represents a literal block of data.
  152.      *
  153.      * <p>
  154.      * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
  155.      * immediately as it will get overwritten sooner or later.
  156.      * </p>
  157.      */
  158.     public static final class LiteralBlock extends Block {
  159.         private final byte[] data;
  160.         private final int offset, length;

  161.         public LiteralBlock(final byte[] data, final int offset, final int length) {
  162.             this.data = data;
  163.             this.offset = offset;
  164.             this.length = length;
  165.         }

  166.         /**
  167.          * The literal data.
  168.          *
  169.          * <p>
  170.          * This returns a life view of the actual data in order to avoid copying, modify the array at your own risk.
  171.          * </p>
  172.          *
  173.          * @return the data
  174.          */
  175.         public byte[] getData() {
  176.             return data;
  177.         }

  178.         /**
  179.          * Length of literal block.
  180.          *
  181.          * @return the length
  182.          */
  183.         public int getLength() {
  184.             return length;
  185.         }

  186.         /**
  187.          * Offset into data where the literal block starts.
  188.          *
  189.          * @return the offset
  190.          */
  191.         public int getOffset() {
  192.             return offset;
  193.         }

  194.         @Override
  195.         public BlockType getType() {
  196.             return BlockType.LITERAL;
  197.         }

  198.         @Override
  199.         public String toString() {
  200.             return "LiteralBlock starting at " + offset + " with length " + length;
  201.         }
  202.     }

  203.     private static final Block THE_EOD = new EOD();

  204.     static final int NUMBER_OF_BYTES_IN_HASH = 3;
  205.     private static final int NO_MATCH = -1;

  206.     // we use a 15 bit hash code as calculated in updateHash
  207.     private static final int HASH_SIZE = 1 << 15;
  208.     private static final int HASH_MASK = HASH_SIZE - 1;

  209.     private static final int H_SHIFT = 5;
  210.     private final Parameters params;
  211.     private final Callback callback;

  212.     // the sliding window, twice as big as "windowSize" parameter
  213.     private final byte[] window;

  214.     // the head of hash-chain - indexed by hash-code, points to the
  215.     // location inside of window of the latest sequence of bytes with
  216.     // the given hash.
  217.     private final int[] head;
  218.     // for each window-location points to the latest earlier location
  219.     // with the same hash. Only stores values for the latest
  220.     // "windowSize" elements, the index is "window location modulo
  221.     // windowSize".
  222.     private final int[] prev;
  223.     // bit mask used when indexing into prev
  224.     private final int wMask;
  225.     private boolean initialized;
  226.     // the position inside of window that shall be encoded right now
  227.     private int currentPosition;
  228.     // the number of bytes available to compress including the one at
  229.     // currentPosition
  230.     private int lookahead;
  231.     // the hash of the three bytes stating at the current position
  232.     private int insertHash;

  233.     // the position inside the window where the current literal
  234.     // block starts (in case we are inside a literal block).
  235.     private int blockStart;

  236.     // position of the current match
  237.     private int matchStart = NO_MATCH;

  238.     // number of missed insertString calls for the up to three last
  239.     // bytes of the last match that can only be performed once more
  240.     // data has been read
  241.     private int missedInserts;

  242.     /**
  243.      * Initializes a compressor with parameters and a callback.
  244.      *
  245.      * @param params   the parameters
  246.      * @param callback the callback
  247.      * @throws NullPointerException if either parameter is {@code null}
  248.      */
  249.     public LZ77Compressor(final Parameters params, final Callback callback) {
  250.         Objects.requireNonNull(params, "params");
  251.         Objects.requireNonNull(callback, "callback");

  252.         this.params = params;
  253.         this.callback = callback;

  254.         final int wSize = params.getWindowSize();
  255.         window = new byte[wSize * 2];
  256.         wMask = wSize - 1;
  257.         head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
  258.         prev = new int[wSize];
  259.     }

  260.     private void catchUpMissedInserts() {
  261.         while (missedInserts > 0) {
  262.             insertString(currentPosition - missedInserts--);
  263.         }
  264.     }

  265.     private void compress() throws IOException {
  266.         final int minMatch = params.getMinBackReferenceLength();
  267.         final boolean lazy = params.getLazyMatching();
  268.         final int lazyThreshold = params.getLazyMatchingThreshold();

  269.         while (lookahead >= minMatch) {
  270.             catchUpMissedInserts();
  271.             int matchLength = 0;
  272.             final int hashHead = insertString(currentPosition);
  273.             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
  274.                 // sets matchStart as a side effect
  275.                 matchLength = longestMatch(hashHead);

  276.                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
  277.                     // try to find a longer match using the next position
  278.                     matchLength = longestMatchForNextPosition(matchLength);
  279.                 }
  280.             }
  281.             if (matchLength >= minMatch) {
  282.                 if (blockStart != currentPosition) {
  283.                     // emit preceding literal block
  284.                     flushLiteralBlock();
  285.                     blockStart = NO_MATCH;
  286.                 }
  287.                 flushBackReference(matchLength);
  288.                 insertStringsInMatch(matchLength);
  289.                 lookahead -= matchLength;
  290.                 currentPosition += matchLength;
  291.                 blockStart = currentPosition;
  292.             } else {
  293.                 // no match, append to current or start a new literal
  294.                 lookahead--;
  295.                 currentPosition++;
  296.                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
  297.                     flushLiteralBlock();
  298.                     blockStart = currentPosition;
  299.                 }
  300.             }
  301.         }
  302.     }

  303.     /**
  304.      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
  305.      *
  306.      * @param data the data to compress - must not be null
  307.      * @throws IOException if the callback throws an exception
  308.      */
  309.     public void compress(final byte[] data) throws IOException {
  310.         compress(data, 0, data.length);
  311.     }

  312.     /**
  313.      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
  314.      *
  315.      * @param data the data to compress - must not be null
  316.      * @param off  the start offset of the data
  317.      * @param len  the number of bytes to compress
  318.      * @throws IOException if the callback throws an exception
  319.      */
  320.     public void compress(final byte[] data, int off, int len) throws IOException {
  321.         final int wSize = params.getWindowSize();
  322.         while (len > wSize) { // chop into windowSize sized chunks
  323.             doCompress(data, off, wSize);
  324.             off += wSize;
  325.             len -= wSize;
  326.         }
  327.         if (len > 0) {
  328.             doCompress(data, off, len);
  329.         }
  330.     }

  331.     // performs the actual algorithm with the pre-condition len <= windowSize
  332.     private void doCompress(final byte[] data, final int off, final int len) throws IOException {
  333.         final int spaceLeft = window.length - currentPosition - lookahead;
  334.         if (len > spaceLeft) {
  335.             slide();
  336.         }
  337.         System.arraycopy(data, off, window, currentPosition + lookahead, len);
  338.         lookahead += len;
  339.         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
  340.             initialize();
  341.         }
  342.         if (initialized) {
  343.             compress();
  344.         }
  345.     }

  346.     /**
  347.      * Tells the compressor to process all remaining data and signal end of data to the callback.
  348.      *
  349.      * <p>
  350.      * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
  351.      * </p>
  352.      *
  353.      * @throws IOException if the callback throws an exception
  354.      */
  355.     public void finish() throws IOException {
  356.         if (blockStart != currentPosition || lookahead > 0) {
  357.             currentPosition += lookahead;
  358.             flushLiteralBlock();
  359.         }
  360.         callback.accept(THE_EOD);
  361.     }

  362.     private void flushBackReference(final int matchLength) throws IOException {
  363.         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
  364.     }

  365.     private void flushLiteralBlock() throws IOException {
  366.         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
  367.     }

  368.     private void initialize() {
  369.         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
  370.             insertHash = nextHash(insertHash, window[i]);
  371.         }
  372.         initialized = true;
  373.     }

  374.     /**
  375.      * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
  376.      *
  377.      * <p>
  378.      * Updates {@code insertHash} and {@code prev} as a side effect.
  379.      * </p>
  380.      */
  381.     private int insertString(final int pos) {
  382.         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
  383.         final int hashHead = head[insertHash];
  384.         prev[pos & wMask] = hashHead;
  385.         head[insertHash] = pos;
  386.         return hashHead;
  387.     }

  388.     private void insertStringsInMatch(final int matchLength) {
  389.         // inserts strings contained in current match
  390.         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
  391.         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
  392.         // currentPosition has been inserted already
  393.         for (int i = 1; i <= stop; i++) {
  394.             insertString(currentPosition + i);
  395.         }
  396.         missedInserts = matchLength - stop - 1;
  397.     }

  398.     /**
  399.      * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
  400.      *
  401.      * <p>
  402.      * Sets matchStart to the index of the start position of the longest match as a side effect.
  403.      * </p>
  404.      */
  405.     private int longestMatch(int matchHead) {
  406.         final int minLength = params.getMinBackReferenceLength();
  407.         int longestMatchLength = minLength - 1;
  408.         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
  409.         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
  410.         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
  411.         final int maxCandidates = params.getMaxCandidates();
  412.         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
  413.             int currentLength = 0;
  414.             for (int i = 0; i < maxPossibleLength; i++) {
  415.                 if (window[matchHead + i] != window[currentPosition + i]) {
  416.                     break;
  417.                 }
  418.                 currentLength++;
  419.             }
  420.             if (currentLength > longestMatchLength) {
  421.                 longestMatchLength = currentLength;
  422.                 matchStart = matchHead;
  423.                 if (currentLength >= niceBackReferenceLength) {
  424.                     // no need to search any further
  425.                     break;
  426.                 }
  427.             }
  428.             matchHead = prev[matchHead & wMask];
  429.         }
  430.         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
  431.     }

  432.     private int longestMatchForNextPosition(final int prevMatchLength) {
  433.         // save a bunch of values to restore them if the next match isn't better than the current one
  434.         final int prevMatchStart = matchStart;
  435.         final int prevInsertHash = insertHash;

  436.         lookahead--;
  437.         currentPosition++;
  438.         final int hashHead = insertString(currentPosition);
  439.         final int prevHashHead = prev[currentPosition & wMask];
  440.         int matchLength = longestMatch(hashHead);

  441.         if (matchLength <= prevMatchLength) {
  442.             // use the first match, as the next one isn't any better
  443.             matchLength = prevMatchLength;
  444.             matchStart = prevMatchStart;

  445.             // restore modified values
  446.             head[insertHash] = prevHashHead;
  447.             insertHash = prevInsertHash;
  448.             currentPosition--;
  449.             lookahead++;
  450.         }
  451.         return matchLength;
  452.     }

  453.     /**
  454.      * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
  455.      * nextHash(H, D).
  456.      *
  457.      * <p>
  458.      * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
  459.      * </p>
  460.      */
  461.     private int nextHash(final int oldHash, final byte nextByte) {
  462.         final int nextVal = nextByte & 0xFF;
  463.         return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
  464.     }

  465.     /**
  466.      * Adds some initial data to fill the window with.
  467.      *
  468.      * <p>
  469.      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
  470.      * LZ4 frame format using block dependency.
  471.      * </p>
  472.      *
  473.      * @param data the data to fill the window with.
  474.      * @throws IllegalStateException if the compressor has already started to accept data
  475.      */
  476.     public void prefill(final byte[] data) {
  477.         if (currentPosition != 0 || lookahead != 0) {
  478.             throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
  479.         }

  480.         // don't need more than windowSize for back-references
  481.         final int len = Math.min(params.getWindowSize(), data.length);
  482.         System.arraycopy(data, data.length - len, window, 0, len);

  483.         if (len >= NUMBER_OF_BYTES_IN_HASH) {
  484.             initialize();
  485.             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
  486.             for (int i = 0; i < stop; i++) {
  487.                 insertString(i);
  488.             }
  489.             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
  490.         } else { // not enough data to hash anything
  491.             missedInserts = len;
  492.         }
  493.         blockStart = currentPosition = len;
  494.     }

  495.     private void slide() throws IOException {
  496.         final int wSize = params.getWindowSize();
  497.         if (blockStart != currentPosition && blockStart < wSize) {
  498.             flushLiteralBlock();
  499.             blockStart = currentPosition;
  500.         }
  501.         System.arraycopy(window, wSize, window, 0, wSize);
  502.         currentPosition -= wSize;
  503.         matchStart -= wSize;
  504.         blockStart -= wSize;
  505.         for (int i = 0; i < HASH_SIZE; i++) {
  506.             final int h = head[i];
  507.             head[i] = h >= wSize ? h - wSize : NO_MATCH;
  508.         }
  509.         for (int i = 0; i < wSize; i++) {
  510.             final int p = prev[i];
  511.             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
  512.         }
  513.     }
  514. }