View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.EOFException;
24  import java.io.InputStream;
25  import java.io.DataInput;
26  import java.io.DataInputStream;
27  import java.io.BufferedInputStream;
28  import java.util.zip.DataFormatException;
29  import java.util.zip.Deflater;
30  import java.util.zip.Inflater;
31  import java.util.zip.CRC32;
32  
33  import org.apache.commons.compress.compressors.CompressorInputStream;
34  import org.apache.commons.compress.utils.ByteUtils;
35  import org.apache.commons.compress.utils.CharsetNames;
36  import org.apache.commons.compress.utils.CountingInputStream;
37  import org.apache.commons.compress.utils.IOUtils;
38  import org.apache.commons.compress.utils.InputStreamStatistics;
39  
40  /**
41   * Input stream that decompresses .gz files.
42   *
43   * <p>This supports decompressing concatenated .gz files which is important
44   * when decompressing standalone .gz files.</p>
45   *
46   * <p>
47   * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
48   * files: it stops after the first member and silently ignores the rest.
49   * It doesn't leave the read position to point to the beginning of the next
50   * member, which makes it difficult to work around the lack of concatenation
51   * support.
52   * </p>
53   *
54   * <p>
55   * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
56   * container format decoder. The actual decompression is done with
57   * {@link java.util.zip.Inflater}.
58   * </p>
59   *
60   * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
61   * or {@code GzipCompressorInputStream(in, false)} with some {@code
62   * InputStream} {@code in} then {@link #read} will return -1 as soon
63   * as the first internal member has been read completely. The stream
64   * {@code in} will be positioned at the start of the second gzip
65   * member if there is one.</p>
66   *
67   * <p>If you use the constructor {@code GzipCompressorInputStream(in,
68   * true)} with some {@code InputStream} {@code in} then {@link #read}
69   * will return -1 once the stream {@code in} has been exhausted. The
70   * data read from a stream constructed this way will consist of the
71   * concatenated data of all gzip members contained inside {@code
72   * in}.</p>
73   *
74   * @see "https://tools.ietf.org/html/rfc1952"
75   */
76  public class GzipCompressorInputStream extends CompressorInputStream
77      implements InputStreamStatistics {
78  
79      // Header flags
80      // private static final int FTEXT = 0x01; // Uninteresting for us
81      private static final int FHCRC = 0x02;
82      private static final int FEXTRA = 0x04;
83      private static final int FNAME = 0x08;
84      private static final int FCOMMENT = 0x10;
85      private static final int FRESERVED = 0xE0;
86  
87      private final CountingInputStream countingStream;
88  
89      // Compressed input stream, possibly wrapped in a
90      // BufferedInputStream, always wrapped in countingStream above
91      private final InputStream in;
92  
93      // True if decompressing multi member streams.
94      private final boolean decompressConcatenated;
95  
96      // Buffer to hold the input data
97      private final byte[] buf = new byte[8192];
98  
99      // Amount of data in buf.
100     private int bufUsed;
101 
102     // Decompressor
103     private Inflater inf = new Inflater(true);
104 
105     // CRC32 from uncompressed data
106     private final CRC32 crc = new CRC32();
107 
108     // True once everything has been decompressed
109     private boolean endReached = false;
110 
111     // used in no-arg read method
112     private final byte[] oneByte = new byte[1];
113 
114     private final GzipParameters parameters = new GzipParameters();
115 
116     /**
117      * Constructs a new input stream that decompresses gzip-compressed data
118      * from the specified input stream.
119      * <p>
120      * This is equivalent to
121      * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
122      * will not decompress concatenated .gz files.
123      *
124      * @param inputStream  the InputStream from which this object should
125      *                     be created of
126      *
127      * @throws IOException if the stream could not be created
128      */
129     public GzipCompressorInputStream(final InputStream inputStream)
130             throws IOException {
131         this(inputStream, false);
132     }
133 
134     /**
135      * Constructs a new input stream that decompresses gzip-compressed data
136      * from the specified input stream.
137      * <p>
138      * If <code>decompressConcatenated</code> is {@code false}:
139      * This decompressor might read more input than it will actually use.
140      * If <code>inputStream</code> supports <code>mark</code> and
141      * <code>reset</code>, then the input position will be adjusted
142      * so that it is right after the last byte of the compressed stream.
143      * If <code>mark</code> isn't supported, the input position will be
144      * undefined.
145      *
146      * @param inputStream  the InputStream from which this object should
147      *                     be created of
148      * @param decompressConcatenated
149      *                     if true, decompress until the end of the input;
150      *                     if false, stop after the first .gz member
151      *
152      * @throws IOException if the stream could not be created
153      */
154     public GzipCompressorInputStream(final InputStream inputStream,
155                                      final boolean decompressConcatenated)
156             throws IOException {
157         countingStream = new CountingInputStream(inputStream);
158         // Mark support is strictly needed for concatenated files only,
159         // but it's simpler if it is always available.
160         if (countingStream.markSupported()) {
161             in = countingStream;
162         } else {
163             in = new BufferedInputStream(countingStream);
164         }
165 
166         this.decompressConcatenated = decompressConcatenated;
167         init(true);
168     }
169 
170     /**
171      * Provides the stream's meta data - may change with each stream
172      * when decompressing concatenated streams.
173      * @return the stream's meta data
174      * @since 1.8
175      */
176     public GzipParameters getMetaData() {
177         return parameters;
178     }
179 
180     private boolean init(final boolean isFirstMember) throws IOException {
181         assert isFirstMember || decompressConcatenated;
182 
183         // Check the magic bytes without a possibility of EOFException.
184         final int magic0 = in.read();
185         final int magic1 = in.read();
186 
187         // If end of input was reached after decompressing at least
188         // one .gz member, we have reached the end of the file successfully.
189         if (magic0 == -1 && !isFirstMember) {
190             return false;
191         }
192 
193         if (magic0 != 31 || magic1 != 139) {
194             throw new IOException(isFirstMember
195                                   ? "Input is not in the .gz format"
196                                   : "Garbage after a valid .gz stream");
197         }
198 
199         // Parsing the rest of the header may throw EOFException.
200         final DataInput inData = new DataInputStream(in);
201         final int method = inData.readUnsignedByte();
202         if (method != Deflater.DEFLATED) {
203             throw new IOException("Unsupported compression method "
204                                   + method + " in the .gz header");
205         }
206 
207         final int flg = inData.readUnsignedByte();
208         if ((flg & FRESERVED) != 0) {
209             throw new IOException(
210                     "Reserved flags are set in the .gz header");
211         }
212 
213         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
214         switch (inData.readUnsignedByte()) { // extra flags
215         case 2:
216             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
217             break;
218         case 4:
219             parameters.setCompressionLevel(Deflater.BEST_SPEED);
220             break;
221         default:
222             // ignored for now
223             break;
224         }
225         parameters.setOperatingSystem(inData.readUnsignedByte());
226 
227         // Extra field, ignored
228         if ((flg & FEXTRA) != 0) {
229             int xlen = inData.readUnsignedByte();
230             xlen |= inData.readUnsignedByte() << 8;
231 
232             // This isn't as efficient as calling in.skip would be,
233             // but it's lazier to handle unexpected end of input this way.
234             // Most files don't have an extra field anyway.
235             while (xlen-- > 0) {
236                 inData.readUnsignedByte();
237             }
238         }
239 
240         // Original file name
241         if ((flg & FNAME) != 0) {
242             parameters.setFilename(new String(readToNull(inData),
243                                               CharsetNames.ISO_8859_1));
244         }
245 
246         // Comment
247         if ((flg & FCOMMENT) != 0) {
248             parameters.setComment(new String(readToNull(inData),
249                                              CharsetNames.ISO_8859_1));
250         }
251 
252         // Header "CRC16" which is actually a truncated CRC32 (which isn't
253         // as good as real CRC16). I don't know if any encoder implementation
254         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
255         // doesn't support this field, but zlib seems to be able to at least
256         // skip over it.
257         if ((flg & FHCRC) != 0) {
258             inData.readShort();
259         }
260 
261         // Reset
262         inf.reset();
263         crc.reset();
264 
265         return true;
266     }
267 
268     private static byte[] readToNull(final DataInput inData) throws IOException {
269         final ByteArrayOutputStream bos = new ByteArrayOutputStream();
270         int b = 0;
271         while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
272             bos.write(b);
273         }
274         return bos.toByteArray();
275     }
276 
277     @Override
278     public int read() throws IOException {
279         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
280     }
281 
282     /**
283      * {@inheritDoc}
284      *
285      * @since 1.1
286      */
287     @Override
288     public int read(final byte[] b, int off, int len) throws IOException {
289         if (endReached) {
290             return -1;
291         }
292 
293         int size = 0;
294 
295         while (len > 0) {
296             if (inf.needsInput()) {
297                 // Remember the current position because we may need to
298                 // rewind after reading too much input.
299                 in.mark(buf.length);
300 
301                 bufUsed = in.read(buf);
302                 if (bufUsed == -1) {
303                     throw new EOFException();
304                 }
305 
306                 inf.setInput(buf, 0, bufUsed);
307             }
308 
309             int ret;
310             try {
311                 ret = inf.inflate(b, off, len);
312             } catch (final DataFormatException e) {
313                 throw new IOException("Gzip-compressed data is corrupt");
314             }
315 
316             crc.update(b, off, ret);
317             off += ret;
318             len -= ret;
319             size += ret;
320             count(ret);
321 
322             if (inf.finished()) {
323                 // We may have read too many bytes. Rewind the read
324                 // position to match the actual amount used.
325                 //
326                 // NOTE: The "if" is there just in case. Since we used
327                 // in.mark earlier, it should always skip enough.
328                 in.reset();
329 
330                 final int skipAmount = bufUsed - inf.getRemaining();
331                 if (IOUtils.skip(in, skipAmount) != skipAmount) {
332                     throw new IOException();
333                 }
334 
335                 bufUsed = 0;
336 
337                 final DataInput inData = new DataInputStream(in);
338 
339                 // CRC32
340                 final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
341 
342                 if (crcStored != crc.getValue()) {
343                     throw new IOException("Gzip-compressed data is corrupt "
344                                           + "(CRC32 error)");
345                 }
346 
347                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
348                 final long isize = ByteUtils.fromLittleEndian(inData, 4);
349 
350                 if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
351                     throw new IOException("Gzip-compressed data is corrupt"
352                                           + "(uncompressed size mismatch)");
353                 }
354 
355                 // See if this is the end of the file.
356                 if (!decompressConcatenated || !init(false)) {
357                     inf.end();
358                     inf = null;
359                     endReached = true;
360                     return size == 0 ? -1 : size;
361                 }
362             }
363         }
364 
365         return size;
366     }
367 
368     /**
369      * Checks if the signature matches what is expected for a .gz file.
370      *
371      * @param signature the bytes to check
372      * @param length    the number of bytes to check
373      * @return          true if this is a .gz stream, false otherwise
374      *
375      * @since 1.1
376      */
377     public static boolean matches(final byte[] signature, final int length) {
378         return length >= 2 && signature[0] == 31 && signature[1] == -117;
379     }
380 
381     /**
382      * Closes the input stream (unless it is System.in).
383      *
384      * @since 1.2
385      */
386     @Override
387     public void close() throws IOException {
388         if (inf != null) {
389             inf.end();
390             inf = null;
391         }
392 
393         if (this.in != System.in) {
394             this.in.close();
395         }
396     }
397 
398     /**
399      * @since 1.17
400      */
401     @Override
402     public long getCompressedCount() {
403         return countingStream.getBytesRead();
404     }
405 }