LZ77Compressor.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
- package org.apache.commons.compress.compressors.lz77support;
- import java.io.IOException;
- import java.util.Objects;
- import org.apache.commons.lang3.ArrayFill;
- /**
- * Helper class for compression algorithms that use the ideas of LZ77.
- *
- * <p>
- * Most LZ77 derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
- * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position". The details of how
- * those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding in the
- * case of DEFLATE for example).
- * </p>
- *
- * <p>
- * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
- * (DEFLATE); an optional "lazy match" optimization can be enabled via the {@link Parameters}. The three-byte hash function used in this class is the same as the one used by zlib
- * and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
- * </p>
- *
- * <p>
- * LZ77 is used vaguely here (as well as in many other places that talk about it :-); LZSS would likely be closer to the truth, but LZ77 has become the synonym for
- * a whole family of algorithms.
- * </p>
- *
- * <p>
- * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
- * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
- * information, the {@link #finish()} method must be called once all data has been fed into the compressor (see the usage sketch below).
- * </p>
- *
- * <p>
- * Several parameters influence the outcome of the "compression":
- * </p>
- * <dl>
- *
- * <dt>{@code windowSize}</dt>
- * <dd>the size of the sliding window; must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
- * of twice the {@code windowSize} - real-world values are in the area of 32k.</dd>
- *
- * <dt>{@code minBackReferenceLength}</dt>
- * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside this implementation, but bigger lengths can be configured.</dd>
- *
- * <dt>{@code maxBackReferenceLength}</dt>
- * <dd>Maximal length of a back-reference found.</dd>
- *
- * <dt>{@code maxOffset}</dt>
- * <dd>Maximal offset of a back-reference.</dd>
- *
- * <dt>{@code maxLiteralLength}</dt>
- * <dd>Maximal length of a literal block.</dd>
- * </dl>
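- *
- * <p>
- * A minimal usage sketch - the window size, the {@code data} array and the way blocks are handled are assumptions of this example; the concrete encoding of
- * literal blocks and back-references depends entirely on the target format:
- * </p>
- *
- * <pre>{@code
- * Parameters params = Parameters.builder(32 * 1024).build(); // window size is an arbitrary choice
- * LZ77Compressor compressor = new LZ77Compressor(params, block -> {
- *     switch (block.getType()) {
- *     case LITERAL:
- *         // encode the LiteralBlock for the target format
- *         break;
- *     case BACK_REFERENCE:
- *         // encode the BackReference for the target format
- *         break;
- *     case EOD:
- *         // write whatever trailer the target format requires
- *         break;
- *     }
- * });
- * compressor.compress(data); // data holds the uncompressed input
- * compressor.finish();       // flushes the last literal block and emits EOD
- * }</pre>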
- *
- * @see "https://tools.ietf.org/html/rfc1951#section-4"
- * @since 1.14
- * @NotThreadSafe
- */
- public class LZ77Compressor {
- /**
- * Represents a back-reference.
- */
- public static final class BackReference extends Block {
- private final int offset, length;
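- /**
- * Constructs a new back-reference.
- *
- * @param offset the offset of the back-reference
- * @param length the length of the back-reference
- */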
- public BackReference(final int offset, final int length) {
- this.offset = offset;
- this.length = length;
- }
- /**
- * Provides the length of the back-reference.
- *
- * @return the length
- */
- public int getLength() {
- return length;
- }
- /**
- * Provides the offset of the back-reference.
- *
- * @return the offset
- */
- public int getOffset() {
- return offset;
- }
- @Override
- public BlockType getType() {
- return BlockType.BACK_REFERENCE;
- }
- @Override
- public String toString() {
- return "BackReference with offset " + offset + " and length " + length;
- }
- }
- /**
- * Base class representing blocks the compressor may emit.
- *
- * <p>
- * This class is not supposed to be subclassed by classes outside of Commons Compress, so it is considered internal and changes that would break subclasses
- * may get introduced with future releases.
- * </p>
- */
- public abstract static class Block {
- /** Enumeration of the block types the compressor may emit. */
- public enum BlockType {
- LITERAL, BACK_REFERENCE, EOD
- }
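- /**
- * Returns the type of this block.
- *
- * @return the block's type
- */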
- public abstract BlockType getType();
- }
- /**
- * Callback invoked while the compressor processes data.
- *
- * <p>
- * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
- * {@link #compress} or {@link #finish}.
- * </p>
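- *
- * <p>
- * A sketch of a callback that simply collects all emitted blocks, for example for inspection in tests - the list is an assumption of this example:
- * </p>
- *
- * <pre>{@code
- * java.util.List<Block> blocks = new java.util.ArrayList<>();
- * LZ77Compressor.Callback callback = blocks::add;
- * }</pre>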
- */
- public interface Callback {
- /**
- * Consumes a block.
- *
- * @param b the block to consume
- * @throws IOException in case of an error
- */
- void accept(Block b) throws IOException;
- }
- /** A simple "we are done" marker. */
- public static final class EOD extends Block {
- @Override
- public BlockType getType() {
- return BlockType.EOD;
- }
- }
- /**
- * Represents a literal block of data.
- *
- * <p>
- * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data; process it inside of {@link Callback#accept}
- * immediately, as it will get overwritten sooner or later.
- * </p>
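- *
- * <p>
- * A sketch of defensively copying the slice when it cannot be consumed right away - {@code literalBlock} stands for the block received inside
- * {@link Callback#accept}:
- * </p>
- *
- * <pre>{@code
- * byte[] copy = java.util.Arrays.copyOfRange(literalBlock.getData(), literalBlock.getOffset(),
- *     literalBlock.getOffset() + literalBlock.getLength());
- * }</pre>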
- */
- public static final class LiteralBlock extends Block {
- private final byte[] data;
- private final int offset, length;
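- /**
- * Constructs a new literal block.
- *
- * @param data the backing array holding the literal data - not copied
- * @param offset the offset into {@code data} where the literal block starts
- * @param length the length of the literal block
- */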
- public LiteralBlock(final byte[] data, final int offset, final int length) {
- this.data = data;
- this.offset = offset;
- this.length = length;
- }
- /**
- * The literal data.
- *
- * <p>
- * This returns a live view of the actual data in order to avoid copying; modify the array at your own risk.
- * </p>
- *
- * @return the data
- */
- public byte[] getData() {
- return data;
- }
- /**
- * Length of literal block.
- *
- * @return the length
- */
- public int getLength() {
- return length;
- }
- /**
- * Offset into data where the literal block starts.
- *
- * @return the offset
- */
- public int getOffset() {
- return offset;
- }
- @Override
- public BlockType getType() {
- return BlockType.LITERAL;
- }
- @Override
- public String toString() {
- return "LiteralBlock starting at " + offset + " with length " + length;
- }
- }
- private static final Block THE_EOD = new EOD();
- static final int NUMBER_OF_BYTES_IN_HASH = 3;
- private static final int NO_MATCH = -1;
- // we use a 15-bit hash code as calculated in nextHash
- private static final int HASH_SIZE = 1 << 15;
- private static final int HASH_MASK = HASH_SIZE - 1;
- private static final int H_SHIFT = 5;
- private final Parameters params;
- private final Callback callback;
- // the sliding window, twice as big as "windowSize" parameter
- private final byte[] window;
- // the head of hash-chain - indexed by hash-code, points to the
- // location inside of window of the latest sequence of bytes with
- // the given hash.
- private final int[] head;
- // for each window-location points to the latest earlier location
- // with the same hash. Only stores values for the latest
- // "windowSize" elements, the index is "window location modulo
- // windowSize".
- private final int[] prev;
- // bit mask used when indexing into prev
- private final int wMask;
- private boolean initialized;
- // the position inside of window that shall be encoded right now
- private int currentPosition;
- // the number of bytes available to compress including the one at
- // currentPosition
- private int lookahead;
- // the hash of the three bytes starting at the current position
- private int insertHash;
- // the position inside the window where the current literal
- // block starts (in case we are inside a literal block).
- private int blockStart;
- // position of the current match
- private int matchStart = NO_MATCH;
- // number of missed insertString calls for the last (up to three)
- // bytes of the last match that can only be performed once more
- // data has been read
- private int missedInserts;
- /**
- * Initializes a compressor with parameters and a callback.
- *
- * @param params the parameters
- * @param callback the callback
- * @throws NullPointerException if either parameter is {@code null}
- */
- public LZ77Compressor(final Parameters params, final Callback callback) {
- Objects.requireNonNull(params, "params");
- Objects.requireNonNull(callback, "callback");
- this.params = params;
- this.callback = callback;
- final int wSize = params.getWindowSize();
- window = new byte[wSize * 2];
- wMask = wSize - 1;
- head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
- prev = new int[wSize];
- }
- private void catchUpMissedInserts() {
- while (missedInserts > 0) {
- insertString(currentPosition - missedInserts--);
- }
- }
- private void compress() throws IOException {
- final int minMatch = params.getMinBackReferenceLength();
- final boolean lazy = params.getLazyMatching();
- final int lazyThreshold = params.getLazyMatchingThreshold();
- while (lookahead >= minMatch) {
- catchUpMissedInserts();
- int matchLength = 0;
- final int hashHead = insertString(currentPosition);
- if (hashHead != NO_MATCH && currentPosition - hashHead <= params.getMaxOffset()) {
- // sets matchStart as a side effect
- matchLength = longestMatch(hashHead);
- if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
- // try to find a longer match using the next position
- matchLength = longestMatchForNextPosition(matchLength);
- }
- }
- if (matchLength >= minMatch) {
- if (blockStart != currentPosition) {
- // emit preceding literal block
- flushLiteralBlock();
- blockStart = NO_MATCH;
- }
- flushBackReference(matchLength);
- insertStringsInMatch(matchLength);
- lookahead -= matchLength;
- currentPosition += matchLength;
- blockStart = currentPosition;
- } else {
- // no match, append to current or start a new literal
- lookahead--;
- currentPosition++;
- if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
- flushLiteralBlock();
- blockStart = currentPosition;
- }
- }
- }
- }
- /**
- * Feeds bytes into the compressor, which in turn may emit zero or more blocks to the callback during the execution of this method.
- *
- * @param data the data to compress - must not be null
- * @throws IOException if the callback throws an exception
- */
- public void compress(final byte[] data) throws IOException {
- compress(data, 0, data.length);
- }
- /**
- * Feeds bytes into the compressor, which in turn may emit zero or more blocks to the callback during the execution of this method.
- *
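- * <p>
- * A sketch of feeding the input in chunks - {@code in} stands for any {@code java.io.InputStream} supplying the uncompressed data and the buffer size is an
- * arbitrary choice:
- * </p>
- *
- * <pre>{@code
- * byte[] buffer = new byte[8192];
- * int n;
- * while ((n = in.read(buffer)) >= 0) {
- *     compressor.compress(buffer, 0, n);
- * }
- * compressor.finish();
- * }</pre>
- *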
- * @param data the data to compress - must not be null
- * @param off the start offset of the data
- * @param len the number of bytes to compress
- * @throws IOException if the callback throws an exception
- */
- public void compress(final byte[] data, int off, int len) throws IOException {
- final int wSize = params.getWindowSize();
- while (len > wSize) { // chop into windowSize sized chunks
- doCompress(data, off, wSize);
- off += wSize;
- len -= wSize;
- }
- if (len > 0) {
- doCompress(data, off, len);
- }
- }
- // performs the actual algorithm with the pre-condition len <= windowSize
- private void doCompress(final byte[] data, final int off, final int len) throws IOException {
- final int spaceLeft = window.length - currentPosition - lookahead;
- if (len > spaceLeft) {
- slide();
- }
- System.arraycopy(data, off, window, currentPosition + lookahead, len);
- lookahead += len;
- if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
- initialize();
- }
- if (initialized) {
- compress();
- }
- }
- /**
- * Tells the compressor to process all remaining data and signal end of data to the callback.
- *
- * <p>
- * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
- * </p>
- *
- * @throws IOException if the callback throws an exception
- */
- public void finish() throws IOException {
- if (blockStart != currentPosition || lookahead > 0) {
- currentPosition += lookahead;
- flushLiteralBlock();
- }
- callback.accept(THE_EOD);
- }
- private void flushBackReference(final int matchLength) throws IOException {
- callback.accept(new BackReference(currentPosition - matchStart, matchLength));
- }
- private void flushLiteralBlock() throws IOException {
- callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
- }
- private void initialize() {
- for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
- insertHash = nextHash(insertHash, window[i]);
- }
- initialized = true;
- }
- /**
- * Inserts the current three-byte sequence into the dictionary and returns the previous head of the hash-chain.
- *
- * <p>
- * Updates {@code insertHash}, {@code head} and {@code prev} as a side effect.
- * </p>
- */
- private int insertString(final int pos) {
- insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
- final int hashHead = head[insertHash];
- prev[pos & wMask] = hashHead;
- head[insertHash] = pos;
- return hashHead;
- }
- private void insertStringsInMatch(final int matchLength) {
- // inserts strings contained in current match
- // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
- final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
- // currentPosition has been inserted already
- for (int i = 1; i <= stop; i++) {
- insertString(currentPosition + i);
- }
- missedInserts = matchLength - stop - 1;
- }
- /**
- * Searches the hash chain for real matches and returns the length of the longest match found (a value smaller than {@code minBackReferenceLength} if there is none) that isn't too far away (with respect to {@code maxOffset}).
- *
- * <p>
- * Sets matchStart to the index of the start position of the longest match as a side effect.
- * </p>
- */
- private int longestMatch(int matchHead) {
- final int minLength = params.getMinBackReferenceLength();
- int longestMatchLength = minLength - 1;
- final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
- final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
- final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
- final int maxCandidates = params.getMaxCandidates();
- for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
- int currentLength = 0;
- for (int i = 0; i < maxPossibleLength; i++) {
- if (window[matchHead + i] != window[currentPosition + i]) {
- break;
- }
- currentLength++;
- }
- if (currentLength > longestMatchLength) {
- longestMatchLength = currentLength;
- matchStart = matchHead;
- if (currentLength >= niceBackReferenceLength) {
- // no need to search any further
- break;
- }
- }
- matchHead = prev[matchHead & wMask];
- }
- return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
- }
- private int longestMatchForNextPosition(final int prevMatchLength) {
- // save a bunch of values to restore them if the next match isn't better than the current one
- final int prevMatchStart = matchStart;
- final int prevInsertHash = insertHash;
- lookahead--;
- currentPosition++;
- final int hashHead = insertString(currentPosition);
- final int prevHashHead = prev[currentPosition & wMask];
- int matchLength = longestMatch(hashHead);
- if (matchLength <= prevMatchLength) {
- // use the first match, as the next one isn't any better
- matchLength = prevMatchLength;
- matchStart = prevMatchStart;
- // restore modified values
- head[insertHash] = prevHashHead;
- insertHash = prevInsertHash;
- currentPosition--;
- lookahead++;
- }
- return matchLength;
- }
- /**
- * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
- * nextHash(H, D).
- *
- * <p>
- * The hash is shifted by five bits on each update, so all effects of A have been shifted out after the third update.
- * </p>
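- *
- * <p>
- * A small worked illustration - the byte values {@code a} to {@code d} are placeholders:
- * </p>
- *
- * <pre>{@code
- * int h = nextHash(nextHash(nextHash(0, a), b), c); // hash covering a, b and c
- * int r = nextHash(h, d);                           // rolling update, now covers b, c and d only
- * }</pre>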
- */
- private int nextHash(final int oldHash, final byte nextByte) {
- final int nextVal = nextByte & 0xFF;
- return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
- }
- /**
- * Adds some initial data to fill the window with.
- *
- * <p>
- * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
- * LZ4 frame format using block dependency.
- * </p>
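- *
- * <p>
- * A sketch of compressing dependent blocks - {@code previousBlock} and {@code nextBlock} are placeholders for the already processed and the upcoming data of
- * this example:
- * </p>
- *
- * <pre>{@code
- * compressor.prefill(previousBlock); // on a freshly created compressor, only the last windowSize bytes are kept
- * compressor.compress(nextBlock);
- * compressor.finish();
- * }</pre>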
- *
- * @param data the data to fill the window with.
- * @throws IllegalStateException if the compressor has already started to accept data
- */
- public void prefill(final byte[] data) {
- if (currentPosition != 0 || lookahead != 0) {
- throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
- }
- // don't need more than windowSize for back-references
- final int len = Math.min(params.getWindowSize(), data.length);
- System.arraycopy(data, data.length - len, window, 0, len);
- if (len >= NUMBER_OF_BYTES_IN_HASH) {
- initialize();
- final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
- for (int i = 0; i < stop; i++) {
- insertString(i);
- }
- missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
- } else { // not enough data to hash anything
- missedInserts = len;
- }
- blockStart = currentPosition = len;
- }
- private void slide() throws IOException {
- final int wSize = params.getWindowSize();
- if (blockStart != currentPosition && blockStart < wSize) {
- flushLiteralBlock();
- blockStart = currentPosition;
- }
- System.arraycopy(window, wSize, window, 0, wSize);
- currentPosition -= wSize;
- matchStart -= wSize;
- blockStart -= wSize;
- for (int i = 0; i < HASH_SIZE; i++) {
- final int h = head[i];
- head[i] = h >= wSize ? h - wSize : NO_MATCH;
- }
- for (int i = 0; i < wSize; i++) {
- final int p = prev[i];
- prev[i] = p >= wSize ? p - wSize : NO_MATCH;
- }
- }
- }