1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.commons.compress.compressors.lz77support;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Arrays;
24
25 import org.apache.commons.compress.compressors.CompressorInputStream;
26 import org.apache.commons.compress.utils.ByteUtils;
27 import org.apache.commons.compress.utils.IOUtils;
28 import org.apache.commons.compress.utils.InputStreamStatistics;
29 import org.apache.commons.io.input.BoundedInputStream;
30
31 /**
32 * Encapsulates code common to LZ77 decompressors.
33 *
34 * <p>
35 * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for
36 * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only
37 * used for back-references.
38 * </p>
39 *
40 * <p>
41 * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg
42 * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise.
43 * </p>
44 *
45 * <p>
46 * The contract for subclasses' {@code read} implementation is:
47 * </p>
48 * <ul>
49 *
50 * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li>
51 *
52 * <li>Use {@link #readOneByte} to access the underlying stream directly.</li>
53 *
54 * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it
55 * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li>
56 *
57 * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference}
58 * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return
59 * {@code 0}.</li>
60 *
61 * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li>
62 *
63 * </ul>
64 *
65 * <p>
66 * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read.
67 * </p>
68 *
69 * @since 1.14
70 */
71 public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
72
73 /** Size of the window - must be bigger than the biggest offset expected. */
74 private final int windowSize;
75
76 /**
77 * Buffer to write decompressed bytes to for back-references, will be three times windowSize big.
78 *
79 * <p>
80 * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy
81 * back-references.
82 * </p>
83 */
84 private final byte[] buf;
85
86 /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
87 private int writeIndex;
88
89 /** Index of the next byte to be read. */
90 private int readIndex;
91
92 /** The underlying stream to read compressed data from */
93 private final BoundedInputStream in;
94
95 /** Number of bytes still to be read from the current literal or back-reference. */
96 private long bytesRemaining;
97
98 /** Offset of the current back-reference. */
99 private int backReferenceOffset;
100
101 /** Uncompressed size */
102 private int size;
103
104 // used in no-arg read method
105 private final byte[] oneByte = new byte[1];
106
107 /**
108 * Supplier that delegates to {@link #readOneByte}.
109 */
110 protected final ByteUtils.ByteSupplier supplier = this::readOneByte;
111
112 /**
113 * Creates a new LZ77 input stream.
114 *
115 * @param is An InputStream to read compressed data from
116 * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected.
117 * @throws IllegalArgumentException if windowSize is not bigger than 0
118 */
119 public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) {
120 this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get();
121 if (windowSize <= 0) {
122 throw new IllegalArgumentException("windowSize must be bigger than 0");
123 }
124 this.windowSize = windowSize;
125 buf = new byte[3 * windowSize];
126 writeIndex = readIndex = 0;
127 bytesRemaining = 0;
128 }
129
130 /** {@inheritDoc} */
131 @Override
132 public int available() {
133 return writeIndex - readIndex;
134 }
135
136 /** {@inheritDoc} */
137 @Override
138 public void close() throws IOException {
139 in.close();
140 }
141
142 /**
143 * @since 1.17
144 */
145 @Override
146 public long getCompressedCount() {
147 return in.getCount();
148 }
149
150 /**
151 * Gets the uncompressed size of the stream
152 *
153 * @return the uncompressed size
154 */
155 public int getSize() {
156 return size;
157 }
158
159 /**
160 * Is there still data remaining inside the current block?
161 *
162 * @return true if there is still data remaining inside the current block.
163 */
164 protected final boolean hasMoreDataInBlock() {
165 return bytesRemaining > 0;
166 }
167
168 /**
169 * Adds some initial data to fill the window with.
170 *
171 * <p>
172 * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
173 * LZ4 frame format using block dependency.
174 * </p>
175 *
176 * @param data the data to fill the window with.
177 * @throws IllegalStateException if the stream has already started to read data
178 */
179 public void prefill(final byte[] data) {
180 if (writeIndex != 0) {
181 throw new IllegalStateException("The stream has already been read from, can't prefill anymore");
182 }
183 // we don't need more data than the big offset could refer to, so cap it
184 final int len = Math.min(windowSize, data.length);
185 // we need the last data as we are dealing with *back*-references
186 System.arraycopy(data, data.length - len, buf, 0, len);
187 writeIndex += len;
188 readIndex += len;
189 }
190
191 /** {@inheritDoc} */
192 @Override
193 public int read() throws IOException {
194 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
195 }
196
197 /**
198 * Reads data from the current back-reference.
199 *
200 * @param b buffer to write data to
201 * @param off offset to start writing to
202 * @param len maximum amount of data to read
203 * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
204 * @throws NullPointerException if {@code b} is null
205 * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
206 */
207 protected final int readBackReference(final byte[] b, final int off, final int len) {
208 final int avail = available();
209 if (len > avail) {
210 tryToCopy(len - avail);
211 }
212 return readFromBuffer(b, off, len);
213 }
214
215 private int readFromBuffer(final byte[] b, final int off, final int len) {
216 final int readable = Math.min(len, available());
217 if (readable > 0) {
218 System.arraycopy(buf, readIndex, b, off, readable);
219 readIndex += readable;
220 if (readIndex > 2 * windowSize) {
221 slideBuffer();
222 }
223 }
224 size += readable;
225 return readable;
226 }
227
228 /**
229 * Reads data from the current literal block.
230 *
231 * @param b buffer to write data to
232 * @param off offset to start writing to
233 * @param len maximum amount of data to read
234 * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
235 * @throws IOException if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read
236 * @throws NullPointerException if {@code b} is null
237 * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
238 */
239 protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
240 final int avail = available();
241 if (len > avail) {
242 tryToReadLiteral(len - avail);
243 }
244 return readFromBuffer(b, off, len);
245 }
246
247 /**
248 * Reads a single byte from the real input stream and ensures the data is accounted for.
249 *
250 * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
251 * @throws IOException if the underlying stream throws
252 */
253 protected final int readOneByte() throws IOException {
254 final int b = in.read();
255 if (b != -1) {
256 count(1);
257 return b & 0xFF;
258 }
259 return -1;
260 }
261
262 private void slideBuffer() {
263 System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
264 writeIndex -= windowSize;
265 readIndex -= windowSize;
266 }
267
268 /**
269 * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
270 *
271 * @param offset the offset of the back-reference
272 * @param length the length of the back-reference
273 * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative
274 */
275 protected final void startBackReference(final int offset, final long length) {
276 if (offset <= 0 || offset > writeIndex) {
277 throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number of bytes available for back-references");
278 }
279 if (length < 0) {
280 throw new IllegalArgumentException("length must not be negative");
281 }
282 backReferenceOffset = offset;
283 bytesRemaining = length;
284 }
285
286 /**
287 * Used by subclasses to signal the next block contains the given amount of literal data.
288 *
289 * @param length the length of the block
290 * @throws IllegalArgumentException if length is negative
291 */
292 protected final void startLiteral(final long length) {
293 if (length < 0) {
294 throw new IllegalArgumentException("length must not be negative");
295 }
296 bytesRemaining = length;
297 }
298
299 private void tryToCopy(final int bytesToCopy) {
300 // this will fit into the buffer without sliding and not
301 // require more than is available inside the back-reference
302 final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex);
303 if (copy == 0) {
304 // NOP
305 } else if (backReferenceOffset == 1) { // pretty common special case
306 final byte last = buf[writeIndex - 1];
307 Arrays.fill(buf, writeIndex, writeIndex + copy, last);
308 writeIndex += copy;
309 } else if (copy < backReferenceOffset) {
310 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
311 writeIndex += copy;
312 } else {
313 // back-reference overlaps with the bytes created from it
314 // like go back two bytes and then copy six (by copying
315 // the last two bytes three time).
316 final int fullRots = copy / backReferenceOffset;
317 for (int i = 0; i < fullRots; i++) {
318 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
319 writeIndex += backReferenceOffset;
320 }
321
322 final int pad = copy - backReferenceOffset * fullRots;
323 if (pad > 0) {
324 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
325 writeIndex += pad;
326 }
327 }
328 bytesRemaining -= copy;
329 }
330
331 private void tryToReadLiteral(final int bytesToRead) throws IOException {
332 // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer"
333 final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex);
334 final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */;
335 count(bytesRead);
336 if (reallyTryToRead != bytesRead) {
337 throw new IOException("Premature end of stream reading literal");
338 }
339 writeIndex += reallyTryToRead;
340 bytesRemaining -= reallyTryToRead;
341 }
342 }