View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.EOFException;
24  import java.io.InputStream;
25  import java.io.DataInput;
26  import java.io.DataInputStream;
27  import java.io.BufferedInputStream;
28  import java.util.zip.DataFormatException;
29  import java.util.zip.Deflater;
30  import java.util.zip.Inflater;
31  import java.util.zip.CRC32;
32  
33  import org.apache.commons.compress.compressors.CompressorInputStream;
34  import org.apache.commons.compress.utils.ByteUtils;
35  import org.apache.commons.compress.utils.CharsetNames;
36  
37  /**
38   * Input stream that decompresses .gz files.
39   *
40   * <p>This supports decompressing concatenated .gz files which is important
41   * when decompressing standalone .gz files.</p>
42   *
43   * <p>
44   * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
45   * files: it stops after the first member and silently ignores the rest.
46   * It doesn't leave the read position to point to the beginning of the next
47   * member, which makes it difficult workaround the lack of concatenation
48   * support.
49   * </p>
50   *
51   * <p>
52   * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
53   * container format decoder. The actual decompression is done with
54   * {@link java.util.zip.Inflater}.
55   * </p>
56   *
57   * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
58   * or {@code GzipCompressorInputStream(in, false)} with some {@code
59   * InputStream} {@code in} then {@link #read} will return -1 as soon
60   * as the first internal member has been read completely. The stream
61   * {@code in} will be positioned at the start of the second gzip
62   * member if there is one.</p>
63   *
64   * <p>If you use the constructor {@code GzipCompressorInputStream(in,
65   * true)} with some {@code InputStream} {@code in} then {@link #read}
66   * will return -1 once the stream {@code in} has been exhausted. The
67   * data read from a stream constructed this way will consist of the
68   * concatenated data of all gzip members contained inside {@code
69   * in}.</p>
70   *
71   * @see "https://tools.ietf.org/html/rfc1952"
72   */
73  public class GzipCompressorInputStream extends CompressorInputStream {
74      // Header flags
75      // private static final int FTEXT = 0x01; // Uninteresting for us
76      private static final int FHCRC = 0x02;
77      private static final int FEXTRA = 0x04;
78      private static final int FNAME = 0x08;
79      private static final int FCOMMENT = 0x10;
80      private static final int FRESERVED = 0xE0;
81  
82      // Compressed input stream, possibly wrapped in a BufferedInputStream
83      private final InputStream in;
84  
85      // True if decompressing multi member streams.
86      private final boolean decompressConcatenated;
87  
88      // Buffer to hold the input data
89      private final byte[] buf = new byte[8192];
90  
91      // Amount of data in buf.
92      private int bufUsed;
93  
94      // Decompressor
95      private Inflater inf = new Inflater(true);
96  
97      // CRC32 from uncompressed data
98      private final CRC32 crc = new CRC32();
99  
100     // True once everything has been decompressed
101     private boolean endReached = false;
102 
103     // used in no-arg read method
104     private final byte[] oneByte = new byte[1];
105 
106     private final GzipParameters parameters = new GzipParameters();
107 
108     /**
109      * Constructs a new input stream that decompresses gzip-compressed data
110      * from the specified input stream.
111      * <p>
112      * This is equivalent to
113      * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
114      * will not decompress concatenated .gz files.
115      *
116      * @param inputStream  the InputStream from which this object should
117      *                     be created of
118      *
119      * @throws IOException if the stream could not be created
120      */
121     public GzipCompressorInputStream(final InputStream inputStream)
122             throws IOException {
123         this(inputStream, false);
124     }
125 
126     /**
127      * Constructs a new input stream that decompresses gzip-compressed data
128      * from the specified input stream.
129      * <p>
130      * If <code>decompressConcatenated</code> is {@code false}:
131      * This decompressor might read more input than it will actually use.
132      * If <code>inputStream</code> supports <code>mark</code> and
133      * <code>reset</code>, then the input position will be adjusted
134      * so that it is right after the last byte of the compressed stream.
135      * If <code>mark</code> isn't supported, the input position will be
136      * undefined.
137      *
138      * @param inputStream  the InputStream from which this object should
139      *                     be created of
140      * @param decompressConcatenated
141      *                     if true, decompress until the end of the input;
142      *                     if false, stop after the first .gz member
143      *
144      * @throws IOException if the stream could not be created
145      */
146     public GzipCompressorInputStream(final InputStream inputStream,
147                                      final boolean decompressConcatenated)
148             throws IOException {
149         // Mark support is strictly needed for concatenated files only,
150         // but it's simpler if it is always available.
151         if (inputStream.markSupported()) {
152             in = inputStream;
153         } else {
154             in = new BufferedInputStream(inputStream);
155         }
156 
157         this.decompressConcatenated = decompressConcatenated;
158         init(true);
159     }
160 
161     /**
162      * Provides the stream's meta data - may change with each stream
163      * when decompressing concatenated streams.
164      * @return the stream's meta data
165      * @since 1.8
166      */
167     public GzipParameters getMetaData() {
168         return parameters;
169     }
170 
171     private boolean init(final boolean isFirstMember) throws IOException {
172         assert isFirstMember || decompressConcatenated;
173 
174         // Check the magic bytes without a possibility of EOFException.
175         final int magic0 = in.read();
176         final int magic1 = in.read();
177 
178         // If end of input was reached after decompressing at least
179         // one .gz member, we have reached the end of the file successfully.
180         if (magic0 == -1 && !isFirstMember) {
181             return false;
182         }
183 
184         if (magic0 != 31 || magic1 != 139) {
185             throw new IOException(isFirstMember
186                                   ? "Input is not in the .gz format"
187                                   : "Garbage after a valid .gz stream");
188         }
189 
190         // Parsing the rest of the header may throw EOFException.
191         final DataInput inData = new DataInputStream(in);
192         final int method = inData.readUnsignedByte();
193         if (method != Deflater.DEFLATED) {
194             throw new IOException("Unsupported compression method "
195                                   + method + " in the .gz header");
196         }
197 
198         final int flg = inData.readUnsignedByte();
199         if ((flg & FRESERVED) != 0) {
200             throw new IOException(
201                     "Reserved flags are set in the .gz header");
202         }
203 
204         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
205         switch (inData.readUnsignedByte()) { // extra flags
206         case 2:
207             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
208             break;
209         case 4:
210             parameters.setCompressionLevel(Deflater.BEST_SPEED);
211             break;
212         default:
213             // ignored for now
214             break;
215         }
216         parameters.setOperatingSystem(inData.readUnsignedByte());
217 
218         // Extra field, ignored
219         if ((flg & FEXTRA) != 0) {
220             int xlen = inData.readUnsignedByte();
221             xlen |= inData.readUnsignedByte() << 8;
222 
223             // This isn't as efficient as calling in.skip would be,
224             // but it's lazier to handle unexpected end of input this way.
225             // Most files don't have an extra field anyway.
226             while (xlen-- > 0) {
227                 inData.readUnsignedByte();
228             }
229         }
230 
231         // Original file name
232         if ((flg & FNAME) != 0) {
233             parameters.setFilename(new String(readToNull(inData),
234                                               CharsetNames.ISO_8859_1));
235         }
236 
237         // Comment
238         if ((flg & FCOMMENT) != 0) {
239             parameters.setComment(new String(readToNull(inData),
240                                              CharsetNames.ISO_8859_1));
241         }
242 
243         // Header "CRC16" which is actually a truncated CRC32 (which isn't
244         // as good as real CRC16). I don't know if any encoder implementation
245         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
246         // doesn't support this field, but zlib seems to be able to at least
247         // skip over it.
248         if ((flg & FHCRC) != 0) {
249             inData.readShort();
250         }
251 
252         // Reset
253         inf.reset();
254         crc.reset();
255 
256         return true;
257     }
258 
259     private static byte[] readToNull(final DataInput inData) throws IOException {
260         final ByteArrayOutputStream bos = new ByteArrayOutputStream();
261         int b = 0;
262         while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
263             bos.write(b);
264         }
265         return bos.toByteArray();
266     }
267 
268     @Override
269     public int read() throws IOException {
270         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
271     }
272 
273     /**
274      * {@inheritDoc}
275      *
276      * @since 1.1
277      */
278     @Override
279     public int read(final byte[] b, int off, int len) throws IOException {
280         if (endReached) {
281             return -1;
282         }
283 
284         int size = 0;
285 
286         while (len > 0) {
287             if (inf.needsInput()) {
288                 // Remember the current position because we may need to
289                 // rewind after reading too much input.
290                 in.mark(buf.length);
291 
292                 bufUsed = in.read(buf);
293                 if (bufUsed == -1) {
294                     throw new EOFException();
295                 }
296 
297                 inf.setInput(buf, 0, bufUsed);
298             }
299 
300             int ret;
301             try {
302                 ret = inf.inflate(b, off, len);
303             } catch (final DataFormatException e) {
304                 throw new IOException("Gzip-compressed data is corrupt");
305             }
306 
307             crc.update(b, off, ret);
308             off += ret;
309             len -= ret;
310             size += ret;
311             count(ret);
312 
313             if (inf.finished()) {
314                 // We may have read too many bytes. Rewind the read
315                 // position to match the actual amount used.
316                 //
317                 // NOTE: The "if" is there just in case. Since we used
318                 // in.mark earlier, it should always skip enough.
319                 in.reset();
320 
321                 final int skipAmount = bufUsed - inf.getRemaining();
322                 if (in.skip(skipAmount) != skipAmount) {
323                     throw new IOException();
324                 }
325 
326                 bufUsed = 0;
327 
328                 final DataInput inData = new DataInputStream(in);
329 
330                 // CRC32
331                 final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
332 
333                 if (crcStored != crc.getValue()) {
334                     throw new IOException("Gzip-compressed data is corrupt "
335                                           + "(CRC32 error)");
336                 }
337 
338                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
339                 final long isize = ByteUtils.fromLittleEndian(inData, 4);
340 
341                 if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
342                     throw new IOException("Gzip-compressed data is corrupt"
343                                           + "(uncompressed size mismatch)");
344                 }
345 
346                 // See if this is the end of the file.
347                 if (!decompressConcatenated || !init(false)) {
348                     inf.end();
349                     inf = null;
350                     endReached = true;
351                     return size == 0 ? -1 : size;
352                 }
353             }
354         }
355 
356         return size;
357     }
358 
359     /**
360      * Checks if the signature matches what is expected for a .gz file.
361      *
362      * @param signature the bytes to check
363      * @param length    the number of bytes to check
364      * @return          true if this is a .gz stream, false otherwise
365      *
366      * @since 1.1
367      */
368     public static boolean matches(final byte[] signature, final int length) {
369         return length >= 2 && signature[0] == 31 && signature[1] == -117;
370     }
371 
372     /**
373      * Closes the input stream (unless it is System.in).
374      *
375      * @since 1.2
376      */
377     @Override
378     public void close() throws IOException {
379         if (inf != null) {
380             inf.end();
381             inf = null;
382         }
383 
384         if (this.in != System.in) {
385             this.in.close();
386         }
387     }
388 }