View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.BufferedInputStream;
22  import java.io.ByteArrayOutputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.util.zip.CRC32;
29  import java.util.zip.DataFormatException;
30  import java.util.zip.Deflater;
31  import java.util.zip.Inflater;
32  
33  import org.apache.commons.compress.compressors.CompressorInputStream;
34  import org.apache.commons.compress.utils.ByteUtils;
35  import org.apache.commons.compress.utils.InputStreamStatistics;
36  import org.apache.commons.io.input.CountingInputStream;
37  
38  /**
39   * Input stream that decompresses .gz files.
40   *
41   * <p>
42   * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
43   * </p>
44   *
45   * <p>
46   * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz files: it stops after the first member and silently ignores the rest. It doesn't
47   * leave the read position to point to the beginning of the next member, which makes it difficult to work around the lack of concatenation support.
48   * </p>
49   *
50   * <p>
51   * Instead of using {@code GZIPInputStream}, this class has its own .gz container format decoder. The actual decompression is done with
52   * {@link java.util.zip.Inflater}.
53   * </p>
54   *
55   * <p>
56   * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)} with some {@code
57   * InputStream} {@code in} then {@link #read} will return -1 as soon as the first internal member has been read completely. The stream {@code in} will be
58   * positioned at the start of the second gzip member if there is one.
59   * </p>
60   *
61   * <p>
62   * If you use the constructor {@code GzipCompressorInputStream(in,
63   * true)} with some {@code InputStream} {@code in} then {@link #read} will return -1 once the stream {@code in} has been exhausted. The data read from a stream
64   * constructed this way will consist of the concatenated data of all gzip members contained inside {@code
65   * in}.
66   * </p>
67   *
68   * @see "https://tools.ietf.org/html/rfc1952"
69   */
70  public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
71  
72      // Header flags
73      // private static final int FTEXT = 0x01; // Uninteresting for us
74      private static final int FHCRC = 0x02;
75      private static final int FEXTRA = 0x04;
76      private static final int FNAME = 0x08;
77      private static final int FCOMMENT = 0x10;
78      private static final int FRESERVED = 0xE0;
79  
80      /**
81       * Checks if the signature matches what is expected for a .gz file.
82       *
83       * @param signature the bytes to check
84       * @param length    the number of bytes to check
85       * @return true if this is a .gz stream, false otherwise
86       *
87       * @since 1.1
88       */
89      public static boolean matches(final byte[] signature, final int length) {
90          return length >= 2 && signature[0] == 31 && signature[1] == -117;
91      }
92  
93      private static byte[] readToNull(final DataInput inData) throws IOException {
94          try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
95              int b;
96              while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
97                  bos.write(b);
98              }
99              return bos.toByteArray();
100         }
101     }
102 
103     private final CountingInputStream countingStream;
104 
105     // Compressed input stream, possibly wrapped in a
106     // BufferedInputStream, always wrapped in countingStream above
107     private final InputStream in;
108 
109     // True if decompressing multi member streams.
110     private final boolean decompressConcatenated;
111 
112     // Buffer to hold the input data
113     private final byte[] buf = new byte[8192];
114 
115     // Amount of data in buf.
116     private int bufUsed;
117 
118     // Decompressor
119     private Inflater inf = new Inflater(true);
120 
121     // CRC32 from uncompressed data
122     private final CRC32 crc = new CRC32();
123 
124     // True once everything has been decompressed
125     private boolean endReached;
126 
127     // used in no-arg read method
128     private final byte[] oneByte = new byte[1];
129 
130     private final GzipParameters parameters = new GzipParameters();
131 
132     /**
133      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
134      * <p>
135      * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
136      *
137      * @param inputStream the InputStream from which this object should be created of
138      *
139      * @throws IOException if the stream could not be created
140      */
141     public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
142         this(inputStream, false);
143     }
144 
145     /**
146      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
147      * <p>
148      * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
149      * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
150      * {@code mark} isn't supported, the input position will be undefined.
151      *
152      * @param inputStream            the InputStream from which this object should be created of
153      * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
154      *
155      * @throws IOException if the stream could not be created
156      */
157     public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
158         countingStream = new CountingInputStream(inputStream);
159         // Mark support is strictly needed for concatenated files only,
160         // but it's simpler if it is always available.
161         if (countingStream.markSupported()) {
162             in = countingStream;
163         } else {
164             in = new BufferedInputStream(countingStream);
165         }
166 
167         this.decompressConcatenated = decompressConcatenated;
168         init(true);
169     }
170 
171     /**
172      * Closes the input stream (unless it is System.in).
173      *
174      * @since 1.2
175      */
176     @Override
177     public void close() throws IOException {
178         if (inf != null) {
179             inf.end();
180             inf = null;
181         }
182 
183         if (this.in != System.in) {
184             this.in.close();
185         }
186     }
187 
188     /**
189      * @since 1.17
190      */
191     @Override
192     public long getCompressedCount() {
193         return countingStream.getByteCount();
194     }
195 
196     /**
197      * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
198      *
199      * @return the stream's meta data
200      * @since 1.8
201      */
202     public GzipParameters getMetaData() {
203         return parameters;
204     }
205 
206     private boolean init(final boolean isFirstMember) throws IOException {
207         assert isFirstMember || decompressConcatenated;
208 
209         // Check the magic bytes without a possibility of EOFException.
210         final int magic0 = in.read();
211 
212         // If end of input was reached after decompressing at least
213         // one .gz member, we have reached the end of the file successfully.
214         if (magic0 == -1 && !isFirstMember) {
215             return false;
216         }
217 
218         if (magic0 != 31 || in.read() != 139) {
219             throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
220         }
221 
222         // Parsing the rest of the header may throw EOFException.
223         final DataInput inData = new DataInputStream(in);
224         final int method = inData.readUnsignedByte();
225         if (method != Deflater.DEFLATED) {
226             throw new IOException("Unsupported compression method " + method + " in the .gz header");
227         }
228 
229         final int flg = inData.readUnsignedByte();
230         if ((flg & FRESERVED) != 0) {
231             throw new IOException("Reserved flags are set in the .gz header");
232         }
233 
234         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
235         switch (inData.readUnsignedByte()) { // extra flags
236         case 2:
237             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
238             break;
239         case 4:
240             parameters.setCompressionLevel(Deflater.BEST_SPEED);
241             break;
242         default:
243             // ignored for now
244             break;
245         }
246         parameters.setOperatingSystem(inData.readUnsignedByte());
247 
248         // Extra field, ignored
249         if ((flg & FEXTRA) != 0) {
250             int xlen = inData.readUnsignedByte();
251             xlen |= inData.readUnsignedByte() << 8;
252 
253             // This isn't as efficient as calling in.skip would be,
254             // but it's lazier to handle unexpected end of input this way.
255             // Most files don't have an extra field anyway.
256             while (xlen-- > 0) {
257                 inData.readUnsignedByte();
258             }
259         }
260 
261         // Original file name
262         if ((flg & FNAME) != 0) {
263             parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
264         }
265 
266         // Comment
267         if ((flg & FCOMMENT) != 0) {
268             parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
269         }
270 
271         // Header "CRC16" which is actually a truncated CRC32 (which isn't
272         // as good as real CRC16). I don't know if any encoder implementation
273         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
274         // doesn't support this field, but zlib seems to be able to at least
275         // skip over it.
276         if ((flg & FHCRC) != 0) {
277             inData.readShort();
278         }
279 
280         // Reset
281         inf.reset();
282         crc.reset();
283 
284         return true;
285     }
286 
287     @Override
288     public int read() throws IOException {
289         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
290     }
291 
292     /**
293      * {@inheritDoc}
294      *
295      * @since 1.1
296      */
297     @Override
298     public int read(final byte[] b, int off, int len) throws IOException {
299         if (len == 0) {
300             return 0;
301         }
302         if (endReached) {
303             return -1;
304         }
305 
306         int size = 0;
307 
308         while (len > 0) {
309             if (inf.needsInput()) {
310                 // Remember the current position because we may need to
311                 // rewind after reading too much input.
312                 in.mark(buf.length);
313 
314                 bufUsed = in.read(buf);
315                 if (bufUsed == -1) {
316                     throw new EOFException();
317                 }
318 
319                 inf.setInput(buf, 0, bufUsed);
320             }
321 
322             final int ret;
323             try {
324                 ret = inf.inflate(b, off, len);
325             } catch (final DataFormatException e) { // NOSONAR
326                 throw new IOException("Gzip-compressed data is corrupt");
327             }
328 
329             crc.update(b, off, ret);
330             off += ret;
331             len -= ret;
332             size += ret;
333             count(ret);
334 
335             if (inf.finished()) {
336                 // We may have read too many bytes. Rewind the read
337                 // position to match the actual amount used.
338                 in.reset();
339 
340                 final int skipAmount = bufUsed - inf.getRemaining();
341                 if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
342                     throw new IOException();
343                 }
344 
345                 bufUsed = 0;
346 
347                 final DataInput inData = new DataInputStream(in);
348 
349                 // CRC32
350                 final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
351 
352                 if (crcStored != crc.getValue()) {
353                     throw new IOException("Gzip-compressed data is corrupt " + "(CRC32 error)");
354                 }
355 
356                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
357                 final long isize = ByteUtils.fromLittleEndian(inData, 4);
358 
359                 if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
360                     throw new IOException("Gzip-compressed data is corrupt" + "(uncompressed size mismatch)");
361                 }
362 
363                 // See if this is the end of the file.
364                 if (!decompressConcatenated || !init(false)) {
365                     inf.end();
366                     inf = null;
367                     endReached = true;
368                     return size == 0 ? -1 : size;
369                 }
370             }
371         }
372 
373         return size;
374     }
375 }