AbstractLZ77CompressorInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.lz77support;

  20. import java.io.IOException;
  21. import java.io.InputStream;
  22. import java.util.Arrays;

  23. import org.apache.commons.compress.compressors.CompressorInputStream;
  24. import org.apache.commons.compress.utils.ByteUtils;
  25. import org.apache.commons.compress.utils.IOUtils;
  26. import org.apache.commons.compress.utils.InputStreamStatistics;
  27. import org.apache.commons.io.input.BoundedInputStream;

  28. /**
  29.  * Encapsulates code common to LZ77 decompressors.
  30.  *
  31.  * <p>
  32.  * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for
  33.  * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only
  34.  * used for back-references.
  35.  * </p>
  36.  *
  37.  * <p>
  38.  * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg
  39.  * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise.
  40.  * </p>
  41.  *
  42.  * <p>
  43.  * The contract for subclasses' {@code read} implementation is:
  44.  * </p>
  45.  * <ul>
  46.  *
  47.  * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li>
  48.  *
  49.  * <li>Use {@link #readOneByte} to access the underlying stream directly.</li>
  50.  *
  51.  * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it
  52.  * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li>
  53.  *
  54.  * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference}
  55.  * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return
  56.  * {@code 0}.</li>
  57.  *
  58.  * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li>
  59.  *
  60.  * </ul>
  61.  *
  62.  * <p>
  63.  * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read.
  64.  * </p>
  65.  *
  66.  * @since 1.14
  67.  */
  68. public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics {

  69.     /** Size of the window - must be bigger than the biggest offset expected. */
  70.     private final int windowSize;

  71.     /**
  72.      * Buffer to write decompressed bytes to for back-references, will be three times windowSize big.
  73.      *
  74.      * <p>
  75.      * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy
  76.      * back-references.
  77.      * </p>
  78.      */
  79.     private final byte[] buf;

  80.     /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
  81.     private int writeIndex;

  82.     /** Index of the next byte to be read. */
  83.     private int readIndex;

  84.     /** The underlying stream to read compressed data from */
  85.     private final BoundedInputStream in;

  86.     /** Number of bytes still to be read from the current literal or back-reference. */
  87.     private long bytesRemaining;

  88.     /** Offset of the current back-reference. */
  89.     private int backReferenceOffset;

  90.     /** Uncompressed size */
  91.     private int size;

  92.     // used in no-arg read method
  93.     private final byte[] oneByte = new byte[1];

  94.     /**
  95.      * Supplier that delegates to {@link #readOneByte}.
  96.      */
  97.     protected final ByteUtils.ByteSupplier supplier = this::readOneByte;

  98.     /**
  99.      * Creates a new LZ77 input stream.
  100.      *
  101.      * @param is         An InputStream to read compressed data from
  102.      * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected.
  103.      *
  104.      * @throws IllegalArgumentException if windowSize is not bigger than 0
  105.      */
  106.     public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) {
  107.         this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get();
  108.         if (windowSize <= 0) {
  109.             throw new IllegalArgumentException("windowSize must be bigger than 0");
  110.         }
  111.         this.windowSize = windowSize;
  112.         buf = new byte[3 * windowSize];
  113.         writeIndex = readIndex = 0;
  114.         bytesRemaining = 0;
  115.     }

  116.     /** {@inheritDoc} */
  117.     @Override
  118.     public int available() {
  119.         return writeIndex - readIndex;
  120.     }

  121.     /** {@inheritDoc} */
  122.     @Override
  123.     public void close() throws IOException {
  124.         in.close();
  125.     }

  126.     /**
  127.      * @since 1.17
  128.      */
  129.     @Override
  130.     public long getCompressedCount() {
  131.         return in.getCount();
  132.     }

  133.     /**
  134.      * Gets the uncompressed size of the stream
  135.      *
  136.      * @return the uncompressed size
  137.      */
  138.     public int getSize() {
  139.         return size;
  140.     }

  141.     /**
  142.      * Is there still data remaining inside the current block?
  143.      *
  144.      * @return true if there is still data remaining inside the current block.
  145.      */
  146.     protected final boolean hasMoreDataInBlock() {
  147.         return bytesRemaining > 0;
  148.     }

  149.     /**
  150.      * Adds some initial data to fill the window with.
  151.      *
  152.      * <p>
  153.      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
  154.      * LZ4 frame format using block dependency.
  155.      * </p>
  156.      *
  157.      * @param data the data to fill the window with.
  158.      * @throws IllegalStateException if the stream has already started to read data
  159.      */
  160.     public void prefill(final byte[] data) {
  161.         if (writeIndex != 0) {
  162.             throw new IllegalStateException("The stream has already been read from, can't prefill anymore");
  163.         }
  164.         // we don't need more data than the big offset could refer to, so cap it
  165.         final int len = Math.min(windowSize, data.length);
  166.         // we need the last data as we are dealing with *back*-references
  167.         System.arraycopy(data, data.length - len, buf, 0, len);
  168.         writeIndex += len;
  169.         readIndex += len;
  170.     }

  171.     /** {@inheritDoc} */
  172.     @Override
  173.     public int read() throws IOException {
  174.         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
  175.     }

  176.     /**
  177.      * Reads data from the current back-reference.
  178.      *
  179.      * @param b   buffer to write data to
  180.      * @param off offset to start writing to
  181.      * @param len maximum amount of data to read
  182.      * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
  183.      * @throws NullPointerException      if {@code b} is null
  184.      * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
  185.      */
  186.     protected final int readBackReference(final byte[] b, final int off, final int len) {
  187.         final int avail = available();
  188.         if (len > avail) {
  189.             tryToCopy(len - avail);
  190.         }
  191.         return readFromBuffer(b, off, len);
  192.     }

  193.     private int readFromBuffer(final byte[] b, final int off, final int len) {
  194.         final int readable = Math.min(len, available());
  195.         if (readable > 0) {
  196.             System.arraycopy(buf, readIndex, b, off, readable);
  197.             readIndex += readable;
  198.             if (readIndex > 2 * windowSize) {
  199.                 slideBuffer();
  200.             }
  201.         }
  202.         size += readable;
  203.         return readable;
  204.     }

  205.     /**
  206.      * Reads data from the current literal block.
  207.      *
  208.      * @param b   buffer to write data to
  209.      * @param off offset to start writing to
  210.      * @param len maximum amount of data to read
  211.      * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
  212.      * @throws IOException               if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read
  213.      * @throws NullPointerException      if {@code b} is null
  214.      * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
  215.      */
  216.     protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
  217.         final int avail = available();
  218.         if (len > avail) {
  219.             tryToReadLiteral(len - avail);
  220.         }
  221.         return readFromBuffer(b, off, len);
  222.     }

  223.     /**
  224.      * Reads a single byte from the real input stream and ensures the data is accounted for.
  225.      *
  226.      * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
  227.      * @throws IOException if the underlying stream throws
  228.      */
  229.     protected final int readOneByte() throws IOException {
  230.         final int b = in.read();
  231.         if (b != -1) {
  232.             count(1);
  233.             return b & 0xFF;
  234.         }
  235.         return -1;
  236.     }

  237.     private void slideBuffer() {
  238.         System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
  239.         writeIndex -= windowSize;
  240.         readIndex -= windowSize;
  241.     }

  242.     /**
  243.      * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
  244.      *
  245.      * @param offset the offset of the back-reference
  246.      * @param length the length of the back-reference
  247.      * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative
  248.      */
  249.     protected final void startBackReference(final int offset, final long length) {
  250.         if (offset <= 0 || offset > writeIndex) {
  251.             throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number" + " of bytes available for back-references");
  252.         }
  253.         if (length < 0) {
  254.             throw new IllegalArgumentException("length must not be negative");
  255.         }
  256.         backReferenceOffset = offset;
  257.         bytesRemaining = length;
  258.     }

  259.     /**
  260.      * Used by subclasses to signal the next block contains the given amount of literal data.
  261.      *
  262.      * @param length the length of the block
  263.      * @throws IllegalArgumentException if length is negative
  264.      */
  265.     protected final void startLiteral(final long length) {
  266.         if (length < 0) {
  267.             throw new IllegalArgumentException("length must not be negative");
  268.         }
  269.         bytesRemaining = length;
  270.     }

  271.     private void tryToCopy(final int bytesToCopy) {
  272.         // this will fit into the buffer without sliding and not
  273.         // require more than is available inside the back-reference
  274.         final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex);
  275.         if (copy == 0) {
  276.             // NOP
  277.         } else if (backReferenceOffset == 1) { // pretty common special case
  278.             final byte last = buf[writeIndex - 1];
  279.             Arrays.fill(buf, writeIndex, writeIndex + copy, last);
  280.             writeIndex += copy;
  281.         } else if (copy < backReferenceOffset) {
  282.             System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
  283.             writeIndex += copy;
  284.         } else {
  285.             // back-reference overlaps with the bytes created from it
  286.             // like go back two bytes and then copy six (by copying
  287.             // the last two bytes three time).
  288.             final int fullRots = copy / backReferenceOffset;
  289.             for (int i = 0; i < fullRots; i++) {
  290.                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
  291.                 writeIndex += backReferenceOffset;
  292.             }

  293.             final int pad = copy - backReferenceOffset * fullRots;
  294.             if (pad > 0) {
  295.                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
  296.                 writeIndex += pad;
  297.             }
  298.         }
  299.         bytesRemaining -= copy;
  300.     }

  301.     private void tryToReadLiteral(final int bytesToRead) throws IOException {
  302.         // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer"
  303.         final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex);
  304.         final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */;
  305.         count(bytesRead);
  306.         if (reallyTryToRead != bytesRead) {
  307.             throw new IOException("Premature end of stream reading literal");
  308.         }
  309.         writeIndex += reallyTryToRead;
  310.         bytesRemaining -= reallyTryToRead;
  311.     }
  312. }