View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.lz77support;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.util.Arrays;
24  
25  import org.apache.commons.compress.compressors.CompressorInputStream;
26  import org.apache.commons.compress.utils.ByteUtils;
27  import org.apache.commons.compress.utils.IOUtils;
28  import org.apache.commons.compress.utils.InputStreamStatistics;
29  import org.apache.commons.io.input.BoundedInputStream;
30  
31  /**
32   * Encapsulates code common to LZ77 decompressors.
33   *
34   * <p>
35   * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for
36   * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only
37   * used for back-references.
38   * </p>
39   *
40   * <p>
41   * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg
42   * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise.
43   * </p>
44   *
45   * <p>
46   * The contract for subclasses' {@code read} implementation is:
47   * </p>
48   * <ul>
49   *
50   * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li>
51   *
52   * <li>Use {@link #readOneByte} to access the underlying stream directly.</li>
53   *
54   * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it
55   * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li>
56   *
57   * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference}
58   * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return
59   * {@code 0}.</li>
60   *
61   * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li>
62   *
63   * </ul>
64   *
65   * <p>
66   * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read.
67   * </p>
68   *
69   * @since 1.14
70   */
71  public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
72  
73      /** Size of the window - must be bigger than the biggest offset expected. */
74      private final int windowSize;
75  
76      /**
77       * Buffer to write decompressed bytes to for back-references, will be three times windowSize big.
78       *
79       * <p>
80       * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy
81       * back-references.
82       * </p>
83       */
84      private final byte[] buf;
85  
86      /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
87      private int writeIndex;
88  
89      /** Index of the next byte to be read. */
90      private int readIndex;
91  
92      /** The underlying stream to read compressed data from */
93      private final BoundedInputStream in;
94  
95      /** Number of bytes still to be read from the current literal or back-reference. */
96      private long bytesRemaining;
97  
98      /** Offset of the current back-reference. */
99      private int backReferenceOffset;
100 
101     /** Uncompressed size */
102     private int size;
103 
104     // used in no-arg read method
105     private final byte[] oneByte = new byte[1];
106 
107     /**
108      * Supplier that delegates to {@link #readOneByte}.
109      */
110     protected final ByteUtils.ByteSupplier supplier = this::readOneByte;
111 
112     /**
113      * Creates a new LZ77 input stream.
114      *
115      * @param is         An InputStream to read compressed data from
116      * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected.
117      * @throws IllegalArgumentException if windowSize is not bigger than 0
118      */
119     public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) {
120         this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get();
121         if (windowSize <= 0) {
122             throw new IllegalArgumentException("windowSize must be bigger than 0");
123         }
124         this.windowSize = windowSize;
125         buf = new byte[3 * windowSize];
126         writeIndex = readIndex = 0;
127         bytesRemaining = 0;
128     }
129 
130     /** {@inheritDoc} */
131     @Override
132     public int available() {
133         return writeIndex - readIndex;
134     }
135 
136     /** {@inheritDoc} */
137     @Override
138     public void close() throws IOException {
139         in.close();
140     }
141 
142     /**
143      * @since 1.17
144      */
145     @Override
146     public long getCompressedCount() {
147         return in.getCount();
148     }
149 
150     /**
151      * Gets the uncompressed size of the stream
152      *
153      * @return the uncompressed size
154      */
155     public int getSize() {
156         return size;
157     }
158 
159     /**
160      * Is there still data remaining inside the current block?
161      *
162      * @return true if there is still data remaining inside the current block.
163      */
164     protected final boolean hasMoreDataInBlock() {
165         return bytesRemaining > 0;
166     }
167 
168     /**
169      * Adds some initial data to fill the window with.
170      *
171      * <p>
172      * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
173      * LZ4 frame format using block dependency.
174      * </p>
175      *
176      * @param data the data to fill the window with.
177      * @throws IllegalStateException if the stream has already started to read data
178      */
179     public void prefill(final byte[] data) {
180         if (writeIndex != 0) {
181             throw new IllegalStateException("The stream has already been read from, can't prefill anymore");
182         }
183         // we don't need more data than the big offset could refer to, so cap it
184         final int len = Math.min(windowSize, data.length);
185         // we need the last data as we are dealing with *back*-references
186         System.arraycopy(data, data.length - len, buf, 0, len);
187         writeIndex += len;
188         readIndex += len;
189     }
190 
191     /** {@inheritDoc} */
192     @Override
193     public int read() throws IOException {
194         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
195     }
196 
197     /**
198      * Reads data from the current back-reference.
199      *
200      * @param b   buffer to write data to
201      * @param off offset to start writing to
202      * @param len maximum amount of data to read
203      * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
204      * @throws NullPointerException      if {@code b} is null
205      * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
206      */
207     protected final int readBackReference(final byte[] b, final int off, final int len) {
208         final int avail = available();
209         if (len > avail) {
210             tryToCopy(len - avail);
211         }
212         return readFromBuffer(b, off, len);
213     }
214 
215     private int readFromBuffer(final byte[] b, final int off, final int len) {
216         final int readable = Math.min(len, available());
217         if (readable > 0) {
218             System.arraycopy(buf, readIndex, b, off, readable);
219             readIndex += readable;
220             if (readIndex > 2 * windowSize) {
221                 slideBuffer();
222             }
223         }
224         size += readable;
225         return readable;
226     }
227 
228     /**
229      * Reads data from the current literal block.
230      *
231      * @param b   buffer to write data to
232      * @param off offset to start writing to
233      * @param len maximum amount of data to read
234      * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
235      * @throws IOException               if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read
236      * @throws NullPointerException      if {@code b} is null
237      * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
238      */
239     protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
240         final int avail = available();
241         if (len > avail) {
242             tryToReadLiteral(len - avail);
243         }
244         return readFromBuffer(b, off, len);
245     }
246 
247     /**
248      * Reads a single byte from the real input stream and ensures the data is accounted for.
249      *
250      * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
251      * @throws IOException if the underlying stream throws
252      */
253     protected final int readOneByte() throws IOException {
254         final int b = in.read();
255         if (b != -1) {
256             count(1);
257             return b & 0xFF;
258         }
259         return -1;
260     }
261 
262     private void slideBuffer() {
263         System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
264         writeIndex -= windowSize;
265         readIndex -= windowSize;
266     }
267 
268     /**
269      * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
270      *
271      * @param offset the offset of the back-reference
272      * @param length the length of the back-reference
273      * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative
274      */
275     protected final void startBackReference(final int offset, final long length) {
276         if (offset <= 0 || offset > writeIndex) {
277             throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number of bytes available for back-references");
278         }
279         if (length < 0) {
280             throw new IllegalArgumentException("length must not be negative");
281         }
282         backReferenceOffset = offset;
283         bytesRemaining = length;
284     }
285 
286     /**
287      * Used by subclasses to signal the next block contains the given amount of literal data.
288      *
289      * @param length the length of the block
290      * @throws IllegalArgumentException if length is negative
291      */
292     protected final void startLiteral(final long length) {
293         if (length < 0) {
294             throw new IllegalArgumentException("length must not be negative");
295         }
296         bytesRemaining = length;
297     }
298 
299     private void tryToCopy(final int bytesToCopy) {
300         // this will fit into the buffer without sliding and not
301         // require more than is available inside the back-reference
302         final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex);
303         if (copy == 0) {
304             // NOP
305         } else if (backReferenceOffset == 1) { // pretty common special case
306             final byte last = buf[writeIndex - 1];
307             Arrays.fill(buf, writeIndex, writeIndex + copy, last);
308             writeIndex += copy;
309         } else if (copy < backReferenceOffset) {
310             System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
311             writeIndex += copy;
312         } else {
313             // back-reference overlaps with the bytes created from it
314             // like go back two bytes and then copy six (by copying
315             // the last two bytes three time).
316             final int fullRots = copy / backReferenceOffset;
317             for (int i = 0; i < fullRots; i++) {
318                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
319                 writeIndex += backReferenceOffset;
320             }
321 
322             final int pad = copy - backReferenceOffset * fullRots;
323             if (pad > 0) {
324                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
325                 writeIndex += pad;
326             }
327         }
328         bytesRemaining -= copy;
329     }
330 
331     private void tryToReadLiteral(final int bytesToRead) throws IOException {
332         // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer"
333         final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex);
334         final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */;
335         count(bytesRead);
336         if (reallyTryToRead != bytesRead) {
337             throw new IOException("Premature end of stream reading literal");
338         }
339         writeIndex += reallyTryToRead;
340         bytesRemaining -= reallyTryToRead;
341     }
342 }