1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.commons.compress.compressors.gzip;
20
21 import java.io.BufferedInputStream;
22 import java.io.ByteArrayOutputStream;
23 import java.io.DataInput;
24 import java.io.DataInputStream;
25 import java.io.EOFException;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.nio.charset.Charset;
29 import java.util.zip.CRC32;
30 import java.util.zip.DataFormatException;
31 import java.util.zip.Deflater;
32 import java.util.zip.Inflater;
33
34 import org.apache.commons.compress.compressors.CompressorInputStream;
35 import org.apache.commons.compress.utils.ByteUtils;
36 import org.apache.commons.compress.utils.InputStreamStatistics;
37 import org.apache.commons.io.IOUtils;
38 import org.apache.commons.io.build.AbstractOrigin;
39 import org.apache.commons.io.build.AbstractStreamBuilder;
40 import org.apache.commons.io.function.IOConsumer;
41 import org.apache.commons.io.input.BoundedInputStream;
42
43 /**
44 * Input stream that decompresses GZIP (.gz) files.
45 *
46 * <p>
47 * This supports decompressing concatenated GZIP files which is important when decompressing standalone GZIP files.
48 * </p>
49 * <p>
50 * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder. Internally, decompression is done using
51 * {@link java.util.zip.Inflater}.
52 * </p>
53 * <p>
54 * If you use the constructor {@code GzipCompressorInputStream(in)}, {@code Builder.setDecompressConcatenated(false)}, or
55 * {@code GzipCompressorInputStream(in, false)}, then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this
56 * case, if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()}, then it will be left positioned just
57 * after the end of the encoded GZIP member; otherwise, some indeterminate number of extra bytes following the encoded GZIP member will have been consumed and
58 * discarded.
59 * </p>
60 * <p>
 * If you use {@code Builder.setDecompressConcatenated(true)} or {@code GzipCompressorInputStream(in, true)}, then {@link #read} will return -1 only after
62 * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded GZIP member, otherwise an
63 * {@link IOException} is thrown. The data read from a stream constructed this way will consist of the concatenated data of all of the encoded GZIP members in
64 * order.
65 * </p>
66 * <p>
67 * To build an instance, use {@link Builder}.
68 * </p>
69 *
70 * @see Builder
71 * @see <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 1952 GZIP File Format Specification</a>
72 */
73 public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
74
75 // @formatter:off
76 /**
77 * Builds a new {@link GzipCompressorInputStream}.
78 *
79 * <p>
80 * For example:
81 * </p>
82 * <pre>{@code
83 * GzipCompressorInputStream s = GzipCompressorInputStream.builder()
84 * .setPath(path)
85 * .setFileNameCharset(StandardCharsets.ISO_8859_1)
86 * .get();}
87 * </pre>
88 *
89 * @see #get()
90 * @since 1.28.0
91 */
92 // @formatter:on
93 public static class Builder extends AbstractStreamBuilder<GzipCompressorInputStream, Builder> {
94
95 /** True if decompressing multi-member streams. */
96 private boolean decompressConcatenated;
97
98 private Charset fileNameCharset = GzipUtils.GZIP_ENCODING;
99
100 private IOConsumer<GzipCompressorInputStream> onMemberStart;
101
102 private IOConsumer<GzipCompressorInputStream> onMemberEnd;
103
104 /**
105 * Constructs a new builder of {@link GzipCompressorInputStream}.
106 */
107 public Builder() {
108 // empty
109 }
110
111 /**
112 * Builds a new {@link GzipCompressorInputStream}.
113 * <p>
114 * You must set input that supports {@link InputStream}, otherwise, this method throws an exception.
115 * </p>
116 *
117 * @return a new instance.
118 * @throws IllegalStateException if the {@code origin} is {@code null}.
119 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
120 * @see AbstractOrigin#getInputStream(java.nio.file.OpenOption...)
121 */
122 @Override
123 public GzipCompressorInputStream get() throws IOException {
124 return new GzipCompressorInputStream(this);
125 }
126
127 /**
128 * Sets whether we should allow decompressing multiple members.
129 *
130 * @param decompressConcatenated whether we should allow decompressing multiple members.
131 * @return this instance.
132 */
133 public Builder setDecompressConcatenated(final boolean decompressConcatenated) {
134 this.decompressConcatenated = decompressConcatenated;
135 return this;
136 }
137
138 /**
139 * Sets the Charset to use for writing file names and comments, where null maps to {@link GzipUtils#GZIP_ENCODING}.
140 * <p>
141 * <em>Setting a value other than {@link GzipUtils#GZIP_ENCODING} is not compliant with the <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC
142 * 1952 GZIP File Format Specification</a></em>. Use at your own risk of interoperability issues.
143 * </p>
144 * <p>
145 * The default value is {@link GzipUtils#GZIP_ENCODING}.
146 * </p>
147 *
148 * @param fileNameCharset the Charset to use for writing file names and comments, null maps to {@link GzipUtils#GZIP_ENCODING}.
149 * @return this instance.
150 */
151 public Builder setFileNameCharset(final Charset fileNameCharset) {
152 this.fileNameCharset = fileNameCharset;
153 return this;
154 }
155
156 /**
157 * Sets the consumer called when a member <em>trailer</em> is parsed.
158 * <p>
159 * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
160 * </p>
161 * <p>
162 * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
163 * </p>
164 *
165 * @param onMemberEnd The consumer.
166 * @return this instance.
167 * @see GzipCompressorInputStream#getMetaData()
168 */
169 public Builder setOnMemberEnd(final IOConsumer<GzipCompressorInputStream> onMemberEnd) {
170 this.onMemberEnd = onMemberEnd;
171 return this;
172 }
173
174 /**
175 * Sets the consumer called when a member <em>header</em> is parsed.
176 * <p>
177 * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
178 * </p>
179 * <p>
180 * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
181 * </p>
182 *
183 * @param onMemberStart The consumer.
184 * @return this instance.
185 * @see GzipCompressorInputStream#getMetaData()
186 */
187 public Builder setOnMemberStart(final IOConsumer<GzipCompressorInputStream> onMemberStart) {
188 this.onMemberStart = onMemberStart;
189 return this;
190 }
191 }
192
/** Shared no-op consumer substituted when the builder supplies no member start or member end callback. */
private static final IOConsumer<GzipCompressorInputStream> NOOP = IOConsumer.noop();
194
/**
 * Constructs a new builder of {@link GzipCompressorInputStream}.
 * <p>
 * Each call returns a fresh {@link Builder}; configure it and call {@link Builder#get()} to create the stream.
 * </p>
 *
 * @return a new builder of {@link GzipCompressorInputStream}.
 * @since 1.28.0
 */
public static Builder builder() {
    return new Builder();
}
204
205 /**
206 * Checks if the signature matches what is expected for a .gz file.
207 *
208 * @param signature the bytes to check
209 * @param length the number of bytes to check
210 * @return true if this is a .gz stream, false otherwise
211 * @since 1.1
212 */
213 public static boolean matches(final byte[] signature, final int length) {
214 return length >= 2 && signature[0] == 31 && signature[1] == -117;
215 }
216
217 private static byte[] readToNull(final DataInput inData) throws IOException {
218 try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
219 int b;
220 while ((b = inData.readUnsignedByte()) != 0) { // NOSONAR
221 bos.write(b);
222 }
223 return bos.toByteArray();
224 }
225 }
226
/** Buffer to hold the input data. */
private final byte[] buf = new byte[8192];

/** Amount of data in buf. */
private int bufUsed;

/** Wrapper around the builder's input stream; its count is what {@link #getCompressedCount()} returns. */
private final BoundedInputStream countingStream;

/** CRC32 from uncompressed data. */
private final CRC32 crc = new CRC32();

/** True if decompressing multi-member streams. */
private final boolean decompressConcatenated;

/** True once everything has been decompressed. */
private boolean endReached;

/** Charset handed to each member's {@link GzipParameters} and used to decode the file name and comment header fields. */
private final Charset fileNameCharset;

/**
 * Compressed input stream, possibly wrapped in a BufferedInputStream, always wrapped in countingStream above
 */
private final InputStream in;

/** Decompressor, created in {@code nowrap} mode; set to {@code null} once it has been ended. */
private Inflater inflater = new Inflater(true);

/** Buffer for no-argument read method. */
private final byte[] oneByte = new byte[1];

/** Metadata of the member whose header was parsed most recently; replaced each time a member header is parsed. */
private GzipParameters parameters;

/** Called after a member header has been parsed; never {@code null}. */
private final IOConsumer<GzipCompressorInputStream> onMemberStart;

/** Called after a member trailer has been verified; never {@code null}. */
private final IOConsumer<GzipCompressorInputStream> onMemberEnd;
262
263 @SuppressWarnings("resource") // caller closes
264 private GzipCompressorInputStream(final Builder builder) throws IOException {
265 countingStream = BoundedInputStream.builder().setInputStream(builder.getInputStream()).get();
266 // Mark support is strictly needed for concatenated files only,
267 // but it's simpler if it is always available.
268 in = countingStream.markSupported() ? countingStream : new BufferedInputStream(countingStream);
269 this.decompressConcatenated = builder.decompressConcatenated;
270 this.fileNameCharset = builder.fileNameCharset;
271 this.onMemberStart = builder.onMemberStart != null ? builder.onMemberStart : NOOP;
272 this.onMemberEnd = builder.onMemberEnd != null ? builder.onMemberEnd : NOOP;
273 init(true);
274 }
275
/**
 * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
 * <p>
 * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
 * </p>
 *
 * @param inputStream the InputStream to read the gzip-compressed data from.
 * @throws IOException if the stream could not be created, for example because the first member header cannot be read or is not valid.
 */
public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
    this(builder().setInputStream(inputStream));
}
288
/**
 * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
 * <p>
 * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
 * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
 * {@code mark} isn't supported, the input position will be undefined.
 * </p>
 *
 * @param inputStream            the InputStream to read the gzip-compressed data from.
 * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
 * @throws IOException if the stream could not be created, for example because the first member header cannot be read or is not valid.
 * @deprecated Use {@link Builder#get()}.
 */
@Deprecated
public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
    this(builder().setInputStream(inputStream).setDecompressConcatenated(decompressConcatenated));
}
306
/**
 * Releases the inflater, if it has not been released yet, and closes the input stream (unless it is System.in).
 *
 * @throws IOException if closing the input stream fails.
 * @since 1.2
 */
@Override
public void close() throws IOException {
    // The inflater is also released when the end of the data is reached, so only end it once.
    if (inflater != null) {
        inflater.end();
        inflater = null;
    }
    // NOTE(review): 'in' is assigned in the private constructor as either countingStream or a BufferedInputStream
    // around it, never the caller's stream itself, so this identity comparison appears to be always true.
    // Confirm that System.in is really protected from being closed.
    if (this.in != System.in) {
        this.in.close();
    }
}
322
/**
 * {@inheritDoc}
 * <p>
 * The value is the count maintained by the counting wrapper around the input stream.
 * </p>
 *
 * @since 1.17
 */
@Override
public long getCompressedCount() {
    return countingStream.getCount();
}
332
/**
 * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
 * <p>
 * The returned object is the one created when the most recent member header was parsed; a new object is created for every member.
 * </p>
 *
 * @return the stream's meta data
 * @since 1.8
 */
public GzipParameters getMetaData() {
    return parameters;
}
342
343 private boolean init(final boolean isFirstMember) throws IOException {
344 if (!isFirstMember && !decompressConcatenated) { // at least one must be true
345 throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false.");
346 }
347 // Check the magic bytes without a possibility of EOFException.
348 final int magic0 = in.read();
349 // If end of input was reached after decompressing at least
350 // one .gz member, we have reached the end of the file successfully.
351 if (magic0 == -1 && !isFirstMember) {
352 return false;
353 }
354 if (magic0 != GzipUtils.ID1 || in.read() != GzipUtils.ID2) {
355 throw new IOException(isFirstMember ? "Input is not in the .gz format." : "Unexpected data after a valid .gz stream.");
356 }
357 parameters = new GzipParameters();
358 parameters.setFileNameCharset(fileNameCharset);
359 // Parsing the rest of the header may throw EOFException.
360 final DataInput inData = new DataInputStream(in);
361 final int method = inData.readUnsignedByte();
362 if (method != Deflater.DEFLATED) {
363 throw new IOException("Unsupported compression method " + method + " in the .gz header");
364 }
365 final int flg = inData.readUnsignedByte();
366 if ((flg & GzipUtils.FRESERVED) != 0) {
367 throw new IOException("Reserved flags are set in the .gz header.");
368 }
369 parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4));
370 switch (inData.readUnsignedByte()) { // extra flags
371 case GzipUtils.XFL_MAX_COMPRESSION:
372 parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
373 break;
374 case GzipUtils.XFL_MAX_SPEED:
375 parameters.setCompressionLevel(Deflater.BEST_SPEED);
376 break;
377 default:
378 parameters.setCompressionLevel(Deflater.DEFAULT_COMPRESSION);
379 break;
380 }
381 parameters.setOperatingSystem(inData.readUnsignedByte());
382 // Extra field
383 if ((flg & GzipUtils.FEXTRA) != 0) {
384 int xlen = inData.readUnsignedByte();
385 xlen |= inData.readUnsignedByte() << 8;
386 final byte[] extra = new byte[xlen];
387 inData.readFully(extra);
388 parameters.setExtraField(ExtraField.fromBytes(extra));
389 }
390 // Original file name
391 if ((flg & GzipUtils.FNAME) != 0) {
392 parameters.setFileName(new String(readToNull(inData), parameters.getFileNameCharset()));
393 }
394 // Comment
395 if ((flg & GzipUtils.FCOMMENT) != 0) {
396 parameters.setComment(new String(readToNull(inData), parameters.getFileNameCharset()));
397 }
398 // Header "CRC16" which is actually a truncated CRC32 (which isn't
399 // as good as real CRC16). I don't know if any encoder implementation
400 // sets this, so it's not worth trying to verify it. GNU gzip 1.4
401 // doesn't support this field, but zlib seems to be able to at least
402 // skip over it.
403 if ((flg & GzipUtils.FHCRC) != 0) {
404 parameters.setHeaderCRC(true);
405 inData.readShort();
406 }
407 // Reset
408 inflater.reset();
409 crc.reset();
410 onMemberStart.accept(this);
411 return true;
412 }
413
414 @Override
415 public int read() throws IOException {
416 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
417 }
418
419 /**
420 * {@inheritDoc}
421 *
422 * @since 1.1
423 */
424 @Override
425 public int read(final byte[] b, int off, int len) throws IOException {
426 if (len == 0) {
427 return 0;
428 }
429 if (endReached) {
430 return -1;
431 }
432
433 int size = 0;
434
435 while (len > 0) {
436 if (inflater.needsInput()) {
437 // Remember the current position because we may need to
438 // rewind after reading too much input.
439 in.mark(buf.length);
440
441 bufUsed = in.read(buf);
442 if (bufUsed == -1) {
443 throw new EOFException();
444 }
445
446 inflater.setInput(buf, 0, bufUsed);
447 }
448
449 final int ret;
450 try {
451 ret = inflater.inflate(b, off, len);
452 } catch (final DataFormatException e) { // NOSONAR
453 throw new IOException("Gzip-compressed data is corrupt.", e);
454 }
455
456 crc.update(b, off, ret);
457 off += ret;
458 len -= ret;
459 size += ret;
460 count(ret);
461
462 if (inflater.finished()) {
463 // We may have read too many bytes. Rewind the read
464 // position to match the actual amount used.
465 in.reset();
466 final int skipAmount = bufUsed - inflater.getRemaining();
467 if (IOUtils.skip(in, skipAmount) != skipAmount) {
468 throw new IOException();
469 }
470 bufUsed = 0;
471 final DataInput inData = new DataInputStream(in);
472 // CRC32
473 final long trailerCrc = ByteUtils.fromLittleEndian(inData, 4);
474 if (trailerCrc != crc.getValue()) {
475 throw new IOException("Gzip-compressed data is corrupt (CRC32 error).");
476 }
477 // Uncompressed size modulo 2^32, ISIZE in the RFC.
478 final long iSize = ByteUtils.fromLittleEndian(inData, 4);
479 if (iSize != (inflater.getBytesWritten() & 0xffffffffL)) {
480 throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch).");
481 }
482 parameters.setTrailerCrc(trailerCrc);
483 parameters.setTrailerISize(iSize);
484 onMemberEnd.accept(this);
485 // See if this is the end of the file.
486 if (!decompressConcatenated || !init(false)) {
487 inflater.end();
488 inflater = null;
489 endReached = true;
490 return size == 0 ? -1 : size;
491 }
492 }
493 }
494
495 return size;
496 }
497 }