LZ77Compressor.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  *   https://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.lz77support;

  20. import java.io.IOException;
  21. import java.util.Objects;

  22. import org.apache.commons.lang3.ArrayFill;

  23. /**
  24.  * Helper class for compression algorithms that use the ideas of LZ77.
  25.  *
  26.  * <p>
  27.  * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
  28.  * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position. The details of how
  29.  * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
  30.  * case of DEFLATE for example).
  31.  * </p>
  32.  *
  33.  * <p>
  34.  * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
  35.  * (DEFLATE) and currently doesn't implement the "lazy match" optimization. The three-byte hash function used in this class is the same as the one used by zlib
  36.  * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
  37.  * </p>
  38.  *
  39.  * <p>
  40.  * LZ77 is used vaguely here (as well as many other places that talk about it :-), LZSS would likely be closer to the truth but LZ77 has become the synonym for
  41.  * a whole family of algorithms.
  42.  * </p>
  43.  *
  44.  * <p>
  45.  * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
  46.  * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
  47.  * information, the {@code #finish} method must be used once all data has been fed into the compressor.
  48.  * </p>
  49.  *
  50.  * <p>
  51.  * Several parameters influence the outcome of the "compression":
  52.  * </p>
  53.  * <dl>
  54.  *
  55.  * <dt>{@code windowSize}</dt>
  56.  * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
  57.  * of twice of {@code windowSize} - real world values are in the area of 32k.</dd>
  58.  *
  59.  * <dt>{@code minBackReferenceLength}</dt>
  60.  * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside of this implementation but bigger lengths can be configured.</dd>
  61.  *
  62.  * <dt>{@code maxBackReferenceLength}</dt>
  63.  * <dd>Maximal length of a back-reference found.</dd>
  64.  *
  65.  * <dt>{@code maxOffset}</dt>
  66.  * <dd>Maximal offset of a back-reference.</dd>
  67.  *
  68.  * <dt>{@code maxLiteralLength}</dt>
  69.  * <dd>Maximal length of a literal block.</dd>
  70.  * </dl>
  71.  *
  72.  * @see "https://tools.ietf.org/html/rfc1951#section-4"
  73.  * @since 1.14
  74.  * @NotThreadSafe
  75.  */
  76. public class LZ77Compressor {

  77.     /**
  78.      * Represents a back-reference.
  79.      */
  80.     public abstract static class AbstractReference extends Block {

  81.         private final int offset;
  82.         private final int length;

  83.         /**
  84.          * Constructs a new instance.
  85.          *
  86.          * @param blockType The block type.
  87.          * @param offset the offset of the reference.
  88.          * @param length the offset of the reference.
  89.          */
  90.         public AbstractReference(final BlockType blockType, final int offset, final int length) {
  91.             super(blockType);
  92.             this.offset = offset;
  93.             this.length = length;
  94.         }

  95.         /**
  96.          * Gets the offset of the reference.
  97.          *
  98.          * @return the length
  99.          */
  100.         public int getLength() {
  101.             return length;
  102.         }

  103.         /**
  104.          * Gets the offset of the reference.
  105.          *
  106.          * @return the offset
  107.          */
  108.         public int getOffset() {
  109.             return offset;
  110.         }

  111.         @Override
  112.         public String toString() {
  113.             return super.toString() + " with offset " + offset + " and length " + length;
  114.         }
  115.     }

  116.     /**
  117.      * Represents a back-reference.
  118.      */
  119.     public static final class BackReference extends AbstractReference {

  120.         /**
  121.          * Constructs a new instance.
  122.          *
  123.          * @param offset the offset of the back-reference.
  124.          * @param length the offset of the back-reference.
  125.          */
  126.         public BackReference(final int offset, final int length) {
  127.             super(BlockType.BACK_REFERENCE, offset, length);
  128.         }

  129.     }

  130.     /**
  131.      * Base class representing blocks the compressor may emit.
  132.      *
  133.      * <p>
  134.      * This class is not supposed to be subclassed by classes outside of Commons Compress so it is considered internal and changed that would break subclasses
  135.      * may get introduced with future releases.
  136.      * </p>
  137.      */
  138.     public abstract static class Block {

  139.         /**
  140.          * Enumerates the block types the compressor emits.
  141.          */
  142.         public enum BlockType {

  143.             /**
  144.              * The literal block type.
  145.              */
  146.             LITERAL,

  147.             /**
  148.              * The back-reference block type.
  149.              */
  150.             BACK_REFERENCE,

  151.             /**
  152.              * The end-of-data block type.
  153.              */
  154.             EOD
  155.         }

  156.         private final BlockType type;

  157.         /**
  158.          * Constructs a new typeless instance.
  159.          *
  160.          * @deprecated Use {@link #Block()}.
  161.          */
  162.         @Deprecated
  163.         public Block() {
  164.             this.type = null;
  165.         }

  166.         /**
  167.          * Constructs a new instance.
  168.          *
  169.          * @param type the block type, may not be {@code null}.
  170.          */
  171.         protected Block(final BlockType type) {
  172.             this.type = Objects.requireNonNull(type);
  173.         }

  174.         /**
  175.          * Gets the the block type.
  176.          *
  177.          * @return the the block type.
  178.          */
  179.         public BlockType getType() {
  180.             return type;
  181.         }

  182.         @Override
  183.         public String toString() {
  184.             return getClass().getSimpleName() + " " + getType();
  185.         }
  186.     }

  187.     /**
  188.      * Callback invoked while the compressor processes data.
  189.      *
  190.      * <p>
  191.      * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
  192.      * {@link #compress} or {@link #finish}.
  193.      * </p>
  194.      */
  195.     public interface Callback {

  196.         /**
  197.          * Consumes a block.
  198.          *
  199.          * @param b the block to consume
  200.          * @throws IOException in case of an error
  201.          */
  202.         void accept(Block b) throws IOException;
  203.     }

  204.     /** A simple "we are done" marker. */
  205.     public static final class EOD extends Block {

  206.         /**
  207.          * The singleton instance.
  208.          */
  209.         private static final EOD INSTANCE = new EOD();

  210.         /**
  211.          * Constructs a new instance.
  212.          */
  213.         public EOD() {
  214.             super(BlockType.EOD);
  215.         }

  216.     }

  217.     /**
  218.      * Represents a literal block of data.
  219.      *
  220.      * <p>
  221.      * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data and process it inside of {@link Callback#accept}
  222.      * immediately as it will get overwritten sooner or later.
  223.      * </p>
  224.      */
  225.     public static final class LiteralBlock extends AbstractReference {

  226.         private final byte[] data;

  227.         /**
  228.          * Constructs a new instance.
  229.          *
  230.          * @param data the literal data.
  231.          * @param offset the length of literal block.
  232.          * @param length the length of literal block.
  233.          */
  234.         public LiteralBlock(final byte[] data, final int offset, final int length) {
  235.             super(BlockType.LITERAL, offset, length);
  236.             this.data = data;
  237.         }

  238.         /**
  239.          * Gets the literal data.
  240.          *
  241.          * <p>
  242.          * This returns a live view of the actual data in order to avoid copying, modify the array at your own risk.
  243.          * </p>
  244.          *
  245.          * @return the data
  246.          */
  247.         public byte[] getData() {
  248.             return data;
  249.         }

  250.     }

  251.     static final int NUMBER_OF_BYTES_IN_HASH = 3;
  252.     private static final int NO_MATCH = -1;

  253.     // we use a 15 bit hash code as calculated in updateHash
  254.     private static final int HASH_SIZE = 1 << 15;
  255.     private static final int HASH_MASK = HASH_SIZE - 1;

  256.     private static final int H_SHIFT = 5;
  257.     private final Parameters params;
  258.     private final Callback callback;

  259.     // the sliding window, twice as big as "windowSize" parameter
  260.     private final byte[] window;

  261.     // the head of hash-chain - indexed by hash-code, points to the
  262.     // location inside of window of the latest sequence of bytes with
  263.     // the given hash.
  264.     private final int[] head;
  265.     // for each window-location points to the latest earlier location
  266.     // with the same hash. Only stores values for the latest
  267.     // "windowSize" elements, the index is "window location modulo
  268.     // windowSize".
  269.     private final int[] prev;
  270.     // bit mask used when indexing into prev
  271.     private final int wMask;
  272.     private boolean initialized;
  273.     // the position inside of window that shall be encoded right now
  274.     private int currentPosition;
  275.     // the number of bytes available to compress including the one at
  276.     // currentPosition
  277.     private int lookahead;
  278.     // the hash of the three bytes stating at the current position
  279.     private int insertHash;

  280.     // the position inside the window where the current literal
  281.     // block starts (in case we are inside a literal block).
  282.     private int blockStart;

  283.     // position of the current match
  284.     private int matchStart = NO_MATCH;

  285.     // number of missed insertString calls for the up to three last
  286.     // bytes of the last match that can only be performed once more
  287.     // data has been read
  288.     private int missedInserts;

  289.     /**
  290.      * Initializes a compressor with parameters and a callback.
  291.      *
  292.      * @param params   the parameters
  293.      * @param callback the callback
  294.      * @throws NullPointerException if either parameter is {@code null}
  295.      */
  296.     public LZ77Compressor(final Parameters params, final Callback callback) {
  297.         Objects.requireNonNull(params, "params");
  298.         Objects.requireNonNull(callback, "callback");

  299.         this.params = params;
  300.         this.callback = callback;

  301.         final int wSize = params.getWindowSize();
  302.         window = new byte[wSize * 2];
  303.         wMask = wSize - 1;
  304.         head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
  305.         prev = new int[wSize];
  306.     }

  307.     private void catchUpMissedInserts() {
  308.         while (missedInserts > 0) {
  309.             insertString(currentPosition - missedInserts--);
  310.         }
  311.     }

  312.     private void compress() throws IOException {
  313.         final int minMatch = params.getMinBackReferenceLength();
  314.         final boolean lazy = params.getLazyMatching();
  315.         final int lazyThreshold = params.getLazyMatchingThreshold();

  316.         while (lookahead >= minMatch) {
  317.             catchUpMissedInserts();
  318.             int matchLength = 0;
  319.             final int hashHead = insertString(currentPosition);
  320.             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
  321.                 // sets matchStart as a side effect
  322.                 matchLength = longestMatch(hashHead);

  323.                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
  324.                     // try to find a longer match using the next position
  325.                     matchLength = longestMatchForNextPosition(matchLength);
  326.                 }
  327.             }
  328.             if (matchLength >= minMatch) {
  329.                 if (blockStart != currentPosition) {
  330.                     // emit preceding literal block
  331.                     flushLiteralBlock();
  332.                     blockStart = NO_MATCH;
  333.                 }
  334.                 flushBackReference(matchLength);
  335.                 insertStringsInMatch(matchLength);
  336.                 lookahead -= matchLength;
  337.                 currentPosition += matchLength;
  338.                 blockStart = currentPosition;
  339.             } else {
  340.                 // no match, append to current or start a new literal
  341.                 lookahead--;
  342.                 currentPosition++;
  343.                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
  344.                     flushLiteralBlock();
  345.                     blockStart = currentPosition;
  346.                 }
  347.             }
  348.         }
  349.     }

  350.     /**
  351.      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
  352.      *
  353.      * @param data the data to compress - must not be null
  354.      * @throws IOException if the callback throws an exception
  355.      */
  356.     public void compress(final byte[] data) throws IOException {
  357.         compress(data, 0, data.length);
  358.     }

  359.     /**
  360.      * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
  361.      *
  362.      * @param data the data to compress - must not be null
  363.      * @param off  the start offset of the data
  364.      * @param len  the number of bytes to compress
  365.      * @throws IOException if the callback throws an exception
  366.      */
  367.     public void compress(final byte[] data, int off, int len) throws IOException {
  368.         final int wSize = params.getWindowSize();
  369.         while (len > wSize) { // chop into windowSize sized chunks
  370.             doCompress(data, off, wSize);
  371.             off += wSize;
  372.             len -= wSize;
  373.         }
  374.         if (len > 0) {
  375.             doCompress(data, off, len);
  376.         }
  377.     }

  378.     // performs the actual algorithm with the pre-condition len <= windowSize
  379.     private void doCompress(final byte[] data, final int off, final int len) throws IOException {
  380.         final int spaceLeft = window.length - currentPosition - lookahead;
  381.         if (len > spaceLeft) {
  382.             slide();
  383.         }
  384.         System.arraycopy(data, off, window, currentPosition + lookahead, len);
  385.         lookahead += len;
  386.         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
  387.             initialize();
  388.         }
  389.         if (initialized) {
  390.             compress();
  391.         }
  392.     }

  393.     /**
  394.      * Tells the compressor to process all remaining data and signal end of data to the callback.
  395.      *
  396.      * <p>
  397.      * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
  398.      * </p>
  399.      *
  400.      * @throws IOException if the callback throws an exception
  401.      */
  402.     public void finish() throws IOException {
  403.         if (blockStart != currentPosition || lookahead > 0) {
  404.             currentPosition += lookahead;
  405.             flushLiteralBlock();
  406.         }
  407.         callback.accept(EOD.INSTANCE);
  408.     }

  409.     private void flushBackReference(final int matchLength) throws IOException {
  410.         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
  411.     }

  412.     private void flushLiteralBlock() throws IOException {
  413.         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
  414.     }

  415.     private void initialize() {
  416.         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
  417.             insertHash = nextHash(insertHash, window[i]);
  418.         }
  419.         initialized = true;
  420.     }

  421.     /**
  422.      * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
  423.      *
  424.      * <p>
  425.      * Updates {@code insertHash} and {@code prev} as a side effect.
  426.      * </p>
  427.      */
  428.     private int insertString(final int pos) {
  429.         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
  430.         final int hashHead = head[insertHash];
  431.         prev[pos & wMask] = hashHead;
  432.         head[insertHash] = pos;
  433.         return hashHead;
  434.     }

  435.     private void insertStringsInMatch(final int matchLength) {
  436.         // inserts strings contained in current match
  437.         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
  438.         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
  439.         // currentPosition has been inserted already
  440.         for (int i = 1; i <= stop; i++) {
  441.             insertString(currentPosition + i);
  442.         }
  443.         missedInserts = matchLength - stop - 1;
  444.     }

  445.     /**
  446.      * Searches the hash chain for real matches and returns the length of the longest match (0 if none were found) that isn't too far away (WRT maxOffset).
  447.      *
  448.      * <p>
  449.      * Sets matchStart to the index of the start position of the longest match as a side effect.
  450.      * </p>
  451.      */
  452.     private int longestMatch(int matchHead) {
  453.         final int minLength = params.getMinBackReferenceLength();
  454.         int longestMatchLength = minLength - 1;
  455.         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
  456.         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
  457.         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
  458.         final int maxCandidates = params.getMaxCandidates();
  459.         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
  460.             int currentLength = 0;
  461.             for (int i = 0; i < maxPossibleLength; i++) {
  462.                 if (window[matchHead + i] != window[currentPosition + i]) {
  463.                     break;
  464.                 }
  465.                 currentLength++;
  466.             }
  467.             if (currentLength > longestMatchLength) {
  468.                 longestMatchLength = currentLength;
  469.                 matchStart = matchHead;
  470.                 if (currentLength >= niceBackReferenceLength) {
  471.                     // no need to search any further
  472.                     break;
  473.                 }
  474.             }
  475.             matchHead = prev[matchHead & wMask];
  476.         }
  477.         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
  478.     }

  479.     private int longestMatchForNextPosition(final int prevMatchLength) {
  480.         // save a bunch of values to restore them if the next match isn't better than the current one
  481.         final int prevMatchStart = matchStart;
  482.         final int prevInsertHash = insertHash;

  483.         lookahead--;
  484.         currentPosition++;
  485.         final int hashHead = insertString(currentPosition);
  486.         final int prevHashHead = prev[currentPosition & wMask];
  487.         int matchLength = longestMatch(hashHead);

  488.         if (matchLength <= prevMatchLength) {
  489.             // use the first match, as the next one isn't any better
  490.             matchLength = prevMatchLength;
  491.             matchStart = prevMatchStart;

  492.             // restore modified values
  493.             head[insertHash] = prevHashHead;
  494.             insertHash = prevInsertHash;
  495.             currentPosition--;
  496.             lookahead++;
  497.         }
  498.         return matchLength;
  499.     }

  500.     /**
  501.      * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
  502.      * nextHash(H, D).
  503.      *
  504.      * <p>
  505.      * The hash is shifted by five bits on each update so all effects of A have been swapped after the third update.
  506.      * </p>
  507.      */
  508.     private int nextHash(final int oldHash, final byte nextByte) {
  509.         final int nextVal = nextByte & 0xFF;
  510.         return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
  511.     }

  512.     /**
  513.      * Adds some initial data to fill the window with.
  514.      *
  515.      * <p>
  516.      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
  517.      * LZ4 frame format using block dependency.
  518.      * </p>
  519.      *
  520.      * @param data the data to fill the window with.
  521.      * @throws IllegalStateException if the compressor has already started to accept data
  522.      */
  523.     public void prefill(final byte[] data) {
  524.         if (currentPosition != 0 || lookahead != 0) {
  525.             throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
  526.         }

  527.         // don't need more than windowSize for back-references
  528.         final int len = Math.min(params.getWindowSize(), data.length);
  529.         System.arraycopy(data, data.length - len, window, 0, len);

  530.         if (len >= NUMBER_OF_BYTES_IN_HASH) {
  531.             initialize();
  532.             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
  533.             for (int i = 0; i < stop; i++) {
  534.                 insertString(i);
  535.             }
  536.             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
  537.         } else { // not enough data to hash anything
  538.             missedInserts = len;
  539.         }
  540.         blockStart = currentPosition = len;
  541.     }

  542.     private void slide() throws IOException {
  543.         final int wSize = params.getWindowSize();
  544.         if (blockStart != currentPosition && blockStart < wSize) {
  545.             flushLiteralBlock();
  546.             blockStart = currentPosition;
  547.         }
  548.         System.arraycopy(window, wSize, window, 0, wSize);
  549.         currentPosition -= wSize;
  550.         matchStart -= wSize;
  551.         blockStart -= wSize;
  552.         for (int i = 0; i < HASH_SIZE; i++) {
  553.             final int h = head[i];
  554.             head[i] = h >= wSize ? h - wSize : NO_MATCH;
  555.         }
  556.         for (int i = 0; i < wSize; i++) {
  557.             final int p = prev[i];
  558.             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
  559.         }
  560.     }
  561. }