View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.BufferedInputStream;
22  import java.io.ByteArrayOutputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.nio.charset.Charset;
29  import java.util.zip.CRC32;
30  import java.util.zip.DataFormatException;
31  import java.util.zip.Deflater;
32  import java.util.zip.Inflater;
33  
34  import org.apache.commons.compress.compressors.CompressorInputStream;
35  import org.apache.commons.compress.utils.ByteUtils;
36  import org.apache.commons.compress.utils.InputStreamStatistics;
37  import org.apache.commons.io.IOUtils;
38  import org.apache.commons.io.build.AbstractOrigin;
39  import org.apache.commons.io.build.AbstractStreamBuilder;
40  import org.apache.commons.io.function.IOConsumer;
41  import org.apache.commons.io.input.BoundedInputStream;
42  
43  /**
44   * Input stream that decompresses GZIP (.gz) files.
45   *
46   * <p>
47   * This supports decompressing concatenated GZIP files which is important when decompressing standalone GZIP files.
48   * </p>
49   * <p>
50   * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder. Internally, decompression is done using
51   * {@link java.util.zip.Inflater}.
52   * </p>
53   * <p>
54   * If you use the constructor {@code GzipCompressorInputStream(in)}, {@code Builder.setDecompressConcatenated(false)}, or
55   * {@code GzipCompressorInputStream(in, false)}, then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this
56   * case, if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()}, then it will be left positioned just
57   * after the end of the encoded GZIP member; otherwise, some indeterminate number of extra bytes following the encoded GZIP member will have been consumed and
58   * discarded.
59   * </p>
60   * <p>
61   * If you use the {@code Builder.setDecompressConcatenated(true)} or {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
62   * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded GZIP member, otherwise an
63   * {@link IOException} is thrown. The data read from a stream constructed this way will consist of the concatenated data of all of the encoded GZIP members in
64   * order.
65   * </p>
66   * <p>
67   * To build an instance, use {@link Builder}.
68   * </p>
69   *
70   * @see Builder
71   * @see <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 1952 GZIP File Format Specification</a>
72   */
73  public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
74  
75      // @formatter:off
76      /**
77       * Builds a new {@link GzipCompressorInputStream}.
78       *
79       * <p>
80       * For example:
81       * </p>
82       * <pre>{@code
83       * GzipCompressorInputStream s = GzipCompressorInputStream.builder()
84       *   .setPath(path)
85       *   .setFileNameCharset(StandardCharsets.ISO_8859_1)
86       *   .get();}
87       * </pre>
88       *
89       * @see #get()
90       * @since 1.28.0
91       */
92      // @formatter:on
93      public static class Builder extends AbstractStreamBuilder<GzipCompressorInputStream, Builder> {
94  
95          /** True if decompressing multi-member streams. */
96          private boolean decompressConcatenated;
97  
98          private Charset fileNameCharset = GzipUtils.GZIP_ENCODING;
99  
100         private IOConsumer<GzipCompressorInputStream> onMemberStart;
101 
102         private IOConsumer<GzipCompressorInputStream> onMemberEnd;
103 
104         /**
105          * Constructs a new builder of {@link GzipCompressorInputStream}.
106          */
107         public Builder() {
108             // empty
109         }
110 
111         /**
112          * Builds a new {@link GzipCompressorInputStream}.
113          * <p>
114          * You must set input that supports {@link InputStream}, otherwise, this method throws an exception.
115          * </p>
116          *
117          * @return a new instance.
118          * @throws IllegalStateException         if the {@code origin} is {@code null}.
119          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
120          * @see AbstractOrigin#getInputStream(java.nio.file.OpenOption...)
121          */
122         @Override
123         public GzipCompressorInputStream get() throws IOException {
124             return new GzipCompressorInputStream(this);
125         }
126 
127         /**
128          * Sets whether we should allow decompressing multiple members.
129          *
130          * @param decompressConcatenated whether we should allow decompressing multiple members.
131          * @return this instance.
132          */
133         public Builder setDecompressConcatenated(final boolean decompressConcatenated) {
134             this.decompressConcatenated = decompressConcatenated;
135             return this;
136         }
137 
138         /**
139          * Sets the Charset to use for writing file names and comments, where null maps to {@link GzipUtils#GZIP_ENCODING}.
140          * <p>
141          * <em>Setting a value other than {@link GzipUtils#GZIP_ENCODING} is not compliant with the <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC
142          * 1952 GZIP File Format Specification</a></em>. Use at your own risk of interoperability issues.
143          * </p>
144          * <p>
145          * The default value is {@link GzipUtils#GZIP_ENCODING}.
146          * </p>
147          *
148          * @param fileNameCharset the Charset to use for writing file names and comments, null maps to {@link GzipUtils#GZIP_ENCODING}.
149          * @return this instance.
150          */
151         public Builder setFileNameCharset(final Charset fileNameCharset) {
152             this.fileNameCharset = fileNameCharset;
153             return this;
154         }
155 
156         /**
157          * Sets the consumer called when a member <em>trailer</em> is parsed.
158          * <p>
159          * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
160          * </p>
161          * <p>
162          * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
163          * </p>
164          *
165          * @param onMemberEnd The consumer.
166          * @return this instance.
167          * @see GzipCompressorInputStream#getMetaData()
168          */
169         public Builder setOnMemberEnd(final IOConsumer<GzipCompressorInputStream> onMemberEnd) {
170             this.onMemberEnd = onMemberEnd;
171             return this;
172         }
173 
174         /**
175          * Sets the consumer called when a member <em>header</em> is parsed.
176          * <p>
177          * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
178          * </p>
179          * <p>
180          * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
181          * </p>
182          *
183          * @param onMemberStart The consumer.
184          * @return this instance.
185          * @see GzipCompressorInputStream#getMetaData()
186          */
187         public Builder setOnMemberStart(final IOConsumer<GzipCompressorInputStream> onMemberStart) {
188             this.onMemberStart = onMemberStart;
189             return this;
190         }
191     }
192 
    /** Shared do-nothing consumer, used in place of a member callback that the builder left {@code null}. */
    private static final IOConsumer<GzipCompressorInputStream> NOOP = IOConsumer.noop();
194 
195     /**
196      * Constructs a new builder of {@link GzipCompressorInputStream}.
197      *
198      * @return a new builder of {@link GzipCompressorInputStream}.
199      * @since 1.28.0
200      */
201     public static Builder builder() {
202         return new Builder();
203     }
204 
205     /**
206      * Checks if the signature matches what is expected for a .gz file.
207      *
208      * @param signature the bytes to check
209      * @param length    the number of bytes to check
210      * @return true if this is a .gz stream, false otherwise
211      * @since 1.1
212      */
213     public static boolean matches(final byte[] signature, final int length) {
214         return length >= 2 && signature[0] == 31 && signature[1] == -117;
215     }
216 
217     private static byte[] readToNull(final DataInput inData) throws IOException {
218         try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
219             int b;
220             while ((b = inData.readUnsignedByte()) != 0) { // NOSONAR
221                 bos.write(b);
222             }
223             return bos.toByteArray();
224         }
225     }
226 
    /** Buffer to hold the compressed input data handed to the inflater. */
    private final byte[] buf = new byte[8192];

    /**
     * Amount of data in buf, as returned by the most recent read of the underlying stream. Used to work out how far to skip forward again after rewinding
     * at the end of a member.
     */
    private int bufUsed;

    /** Wrapper around the stream obtained from the builder; its count backs {@link #getCompressedCount()}. */
    private final BoundedInputStream countingStream;

    /** CRC32 from uncompressed data of the current member; reset for each member and compared with the trailer value. */
    private final CRC32 crc = new CRC32();

    /** True if decompressing multi-member streams. */
    private final boolean decompressConcatenated;

    /** True once everything has been decompressed; reads return -1 from then on. */
    private boolean endReached;

    /** Charset handed to the {@link GzipParameters} of every member and used to decode header file names and comments. */
    private final Charset fileNameCharset;

    /**
     * Compressed input stream, possibly wrapped in a BufferedInputStream, always wrapped in countingStream above
     */
    private final InputStream in;

    /** Decompressor; set to {@code null} once it has been released, either by {@link #close()} or when the end of the data is reached. */
    private Inflater inflater = new Inflater(true);

    /** Buffer for no-argument read method. */
    private final byte[] oneByte = new byte[1];

    /** Metadata of the member currently being read; replaced each time a member header is parsed. */
    private GzipParameters parameters;

    /** Callback invoked after each member header has been parsed; never {@code null}. */
    private final IOConsumer<GzipCompressorInputStream> onMemberStart;

    /** Callback invoked after each member trailer has been verified; never {@code null}. */
    private final IOConsumer<GzipCompressorInputStream> onMemberEnd;
262 
263     @SuppressWarnings("resource") // caller closes
264     private GzipCompressorInputStream(final Builder builder) throws IOException {
265         countingStream = BoundedInputStream.builder().setInputStream(builder.getInputStream()).get();
266         // Mark support is strictly needed for concatenated files only,
267         // but it's simpler if it is always available.
268         in = countingStream.markSupported() ? countingStream : new BufferedInputStream(countingStream);
269         this.decompressConcatenated = builder.decompressConcatenated;
270         this.fileNameCharset = builder.fileNameCharset;
271         this.onMemberStart = builder.onMemberStart != null ? builder.onMemberStart : NOOP;
272         this.onMemberEnd = builder.onMemberEnd != null ? builder.onMemberEnd : NOOP;
273         init(true);
274     }
275 
276     /**
277      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
278      * <p>
279      * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
280      * </p>
281      *
282      * @param inputStream the InputStream from which this object should be created of
283      * @throws IOException if the stream could not be created
284      */
285     public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
286         this(builder().setInputStream(inputStream));
287     }
288 
289     /**
290      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
291      * <p>
292      * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
293      * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
294      * {@code mark} isn't supported, the input position will be undefined.
295      * </p>
296      *
297      * @param inputStream            the InputStream from which this object should be created of
298      * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
299      * @throws IOException if the stream could not be created
300      * @deprecated Use {@link Builder#get()}.
301      */
302     @Deprecated
303     public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
304         this(builder().setInputStream(inputStream).setDecompressConcatenated(decompressConcatenated));
305     }
306 
307     /**
308      * Closes the input stream (unless it is System.in).
309      *
310      * @since 1.2
311      */
312     @Override
313     public void close() throws IOException {
314         if (inflater != null) {
315             inflater.end();
316             inflater = null;
317         }
318         if (this.in != System.in) {
319             this.in.close();
320         }
321     }
322 
323     /**
324      * {@inheritDoc}.
325      *
326      * @since 1.17
327      */
328     @Override
329     public long getCompressedCount() {
330         return countingStream.getCount();
331     }
332 
333     /**
334      * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
335      *
336      * @return the stream's meta data
337      * @since 1.8
338      */
339     public GzipParameters getMetaData() {
340         return parameters;
341     }
342 
343     private boolean init(final boolean isFirstMember) throws IOException {
344         if (!isFirstMember && !decompressConcatenated) { // at least one must be true
345             throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false.");
346         }
347         // Check the magic bytes without a possibility of EOFException.
348         final int magic0 = in.read();
349         // If end of input was reached after decompressing at least
350         // one .gz member, we have reached the end of the file successfully.
351         if (magic0 == -1 && !isFirstMember) {
352             return false;
353         }
354         if (magic0 != GzipUtils.ID1 || in.read() != GzipUtils.ID2) {
355             throw new IOException(isFirstMember ? "Input is not in the .gz format." : "Unexpected data after a valid .gz stream.");
356         }
357         parameters = new GzipParameters();
358         parameters.setFileNameCharset(fileNameCharset);
359         // Parsing the rest of the header may throw EOFException.
360         final DataInput inData = new DataInputStream(in);
361         final int method = inData.readUnsignedByte();
362         if (method != Deflater.DEFLATED) {
363             throw new IOException("Unsupported compression method " + method + " in the .gz header");
364         }
365         final int flg = inData.readUnsignedByte();
366         if ((flg & GzipUtils.FRESERVED) != 0) {
367             throw new IOException("Reserved flags are set in the .gz header.");
368         }
369         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4));
370         switch (inData.readUnsignedByte()) { // extra flags
371         case GzipUtils.XFL_MAX_COMPRESSION:
372             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
373             break;
374         case GzipUtils.XFL_MAX_SPEED:
375             parameters.setCompressionLevel(Deflater.BEST_SPEED);
376             break;
377         default:
378             parameters.setCompressionLevel(Deflater.DEFAULT_COMPRESSION);
379             break;
380         }
381         parameters.setOperatingSystem(inData.readUnsignedByte());
382         // Extra field
383         if ((flg & GzipUtils.FEXTRA) != 0) {
384             int xlen = inData.readUnsignedByte();
385             xlen |= inData.readUnsignedByte() << 8;
386             final byte[] extra = new byte[xlen];
387             inData.readFully(extra);
388             parameters.setExtraField(ExtraField.fromBytes(extra));
389         }
390         // Original file name
391         if ((flg & GzipUtils.FNAME) != 0) {
392             parameters.setFileName(new String(readToNull(inData), parameters.getFileNameCharset()));
393         }
394         // Comment
395         if ((flg & GzipUtils.FCOMMENT) != 0) {
396             parameters.setComment(new String(readToNull(inData), parameters.getFileNameCharset()));
397         }
398         // Header "CRC16" which is actually a truncated CRC32 (which isn't
399         // as good as real CRC16). I don't know if any encoder implementation
400         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
401         // doesn't support this field, but zlib seems to be able to at least
402         // skip over it.
403         if ((flg & GzipUtils.FHCRC) != 0) {
404             parameters.setHeaderCRC(true);
405             inData.readShort();
406         }
407         // Reset
408         inflater.reset();
409         crc.reset();
410         onMemberStart.accept(this);
411         return true;
412     }
413 
414     @Override
415     public int read() throws IOException {
416         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
417     }
418 
419     /**
420      * {@inheritDoc}
421      *
422      * @since 1.1
423      */
424     @Override
425     public int read(final byte[] b, int off, int len) throws IOException {
426         if (len == 0) {
427             return 0;
428         }
429         if (endReached) {
430             return -1;
431         }
432 
433         int size = 0;
434 
435         while (len > 0) {
436             if (inflater.needsInput()) {
437                 // Remember the current position because we may need to
438                 // rewind after reading too much input.
439                 in.mark(buf.length);
440 
441                 bufUsed = in.read(buf);
442                 if (bufUsed == -1) {
443                     throw new EOFException();
444                 }
445 
446                 inflater.setInput(buf, 0, bufUsed);
447             }
448 
449             final int ret;
450             try {
451                 ret = inflater.inflate(b, off, len);
452             } catch (final DataFormatException e) { // NOSONAR
453                 throw new IOException("Gzip-compressed data is corrupt.", e);
454             }
455 
456             crc.update(b, off, ret);
457             off += ret;
458             len -= ret;
459             size += ret;
460             count(ret);
461 
462             if (inflater.finished()) {
463                 // We may have read too many bytes. Rewind the read
464                 // position to match the actual amount used.
465                 in.reset();
466                 final int skipAmount = bufUsed - inflater.getRemaining();
467                 if (IOUtils.skip(in, skipAmount) != skipAmount) {
468                     throw new IOException();
469                 }
470                 bufUsed = 0;
471                 final DataInput inData = new DataInputStream(in);
472                 // CRC32
473                 final long trailerCrc = ByteUtils.fromLittleEndian(inData, 4);
474                 if (trailerCrc != crc.getValue()) {
475                     throw new IOException("Gzip-compressed data is corrupt (CRC32 error).");
476                 }
477                 // Uncompressed size modulo 2^32, ISIZE in the RFC.
478                 final long iSize = ByteUtils.fromLittleEndian(inData, 4);
479                 if (iSize != (inflater.getBytesWritten() & 0xffffffffL)) {
480                     throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch).");
481                 }
482                 parameters.setTrailerCrc(trailerCrc);
483                 parameters.setTrailerISize(iSize);
484                 onMemberEnd.accept(this);
485                 // See if this is the end of the file.
486                 if (!decompressConcatenated || !init(false)) {
487                     inflater.end();
488                     inflater = null;
489                     endReached = true;
490                     return size == 0 ? -1 : size;
491                 }
492             }
493         }
494 
495         return size;
496     }
497 }