View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.EOFException;
24  import java.io.InputStream;
25  import java.io.DataInputStream;
26  import java.io.BufferedInputStream;
27  import java.util.zip.DataFormatException;
28  import java.util.zip.Deflater;
29  import java.util.zip.Inflater;
30  import java.util.zip.CRC32;
31  
32  import org.apache.commons.compress.compressors.CompressorInputStream;
33  import org.apache.commons.compress.utils.CharsetNames;
34  
35  /**
36   * Input stream that decompresses .gz files.
37   * This supports decompressing concatenated .gz files which is important
38   * when decompressing standalone .gz files.
39   * <p>
40   * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
41   * files: it stops after the first member and silently ignores the rest.
42   * It doesn't leave the read position to point to the beginning of the next
43   * member, which makes it difficult workaround the lack of concatenation
44   * support.
45   * <p>
46   * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
47   * container format decoder. The actual decompression is done with
48   * {@link java.util.zip.Inflater}.
49   */
50  public class GzipCompressorInputStream extends CompressorInputStream {
51      // Header flags
52      // private static final int FTEXT = 0x01; // Uninteresting for us
53      private static final int FHCRC = 0x02;
54      private static final int FEXTRA = 0x04;
55      private static final int FNAME = 0x08;
56      private static final int FCOMMENT = 0x10;
57      private static final int FRESERVED = 0xE0;
58  
59      // Compressed input stream, possibly wrapped in a BufferedInputStream
60      private final InputStream in;
61  
62      // True if decompressing multimember streams.
63      private final boolean decompressConcatenated;
64  
65      // Buffer to hold the input data
66      private final byte[] buf = new byte[8192];
67  
68      // Amount of data in buf.
69      private int bufUsed = 0;
70  
71      // Decompressor
72      private Inflater inf = new Inflater(true);
73  
74      // CRC32 from uncompressed data
75      private final CRC32 crc = new CRC32();
76  
77      private int memberSize;
78  
79      // True once everything has been decompressed
80      private boolean endReached = false;
81  
82      // used in no-arg read method
83      private final byte[] oneByte = new byte[1];
84  
85      private final GzipParameters parameters = new GzipParameters();
86  
87      /**
88       * Constructs a new input stream that decompresses gzip-compressed data
89       * from the specified input stream.
90       * <p>
91       * This is equivalent to
92       * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
93       * will not decompress concatenated .gz files.
94       *
95       * @param inputStream  the InputStream from which this object should
96       *                     be created of
97       *
98       * @throws IOException if the stream could not be created
99       */
100     public GzipCompressorInputStream(InputStream inputStream)
101             throws IOException {
102         this(inputStream, false);
103     }
104 
105     /**
106      * Constructs a new input stream that decompresses gzip-compressed data
107      * from the specified input stream.
108      * <p>
109      * If <code>decompressConcatenated</code> is {@code false}:
110      * This decompressor might read more input than it will actually use.
111      * If <code>inputStream</code> supports <code>mark</code> and
112      * <code>reset</code>, then the input position will be adjusted
113      * so that it is right after the last byte of the compressed stream.
114      * If <code>mark</code> isn't supported, the input position will be
115      * undefined.
116      *
117      * @param inputStream  the InputStream from which this object should
118      *                     be created of
119      * @param decompressConcatenated
120      *                     if true, decompress until the end of the input;
121      *                     if false, stop after the first .gz member
122      *
123      * @throws IOException if the stream could not be created
124      */
125     public GzipCompressorInputStream(InputStream inputStream,
126                                      boolean decompressConcatenated)
127             throws IOException {
128         // Mark support is strictly needed for concatenated files only,
129         // but it's simpler if it is always available.
130         if (inputStream.markSupported()) {
131             in = inputStream;
132         } else {
133             in = new BufferedInputStream(inputStream);
134         }
135 
136         this.decompressConcatenated = decompressConcatenated;
137         init(true);
138     }
139 
140     /**
141      * Provides the stream's meta data - may change with each stream
142      * when decompressing concatenated streams.
143      * @return the stream's meta data
144      * @since 1.8
145      */
146     public GzipParameters getMetaData() {
147         return parameters;
148     }
149 
150     private boolean init(boolean isFirstMember) throws IOException {
151         assert isFirstMember || decompressConcatenated;
152 
153         // Check the magic bytes without a possibility of EOFException.
154         int magic0 = in.read();
155         int magic1 = in.read();
156 
157         // If end of input was reached after decompressing at least
158         // one .gz member, we have reached the end of the file successfully.
159         if (magic0 == -1 && !isFirstMember) {
160             return false;
161         }
162 
163         if (magic0 != 31 || magic1 != 139) {
164             throw new IOException(isFirstMember
165                                   ? "Input is not in the .gz format"
166                                   : "Garbage after a valid .gz stream");
167         }
168 
169         // Parsing the rest of the header may throw EOFException.
170         DataInputStream inData = new DataInputStream(in);
171         int method = inData.readUnsignedByte();
172         if (method != Deflater.DEFLATED) {
173             throw new IOException("Unsupported compression method "
174                                   + method + " in the .gz header");
175         }
176 
177         int flg = inData.readUnsignedByte();
178         if ((flg & FRESERVED) != 0) {
179             throw new IOException(
180                     "Reserved flags are set in the .gz header");
181         }
182 
183         parameters.setModificationTime(readLittleEndianInt(inData) * 1000);
184         switch (inData.readUnsignedByte()) { // extra flags
185         case 2:
186             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
187             break;
188         case 4:
189             parameters.setCompressionLevel(Deflater.BEST_SPEED);
190             break;
191         default:
192             // ignored for now
193             break;
194         }
195         parameters.setOperatingSystem(inData.readUnsignedByte());
196 
197         // Extra field, ignored
198         if ((flg & FEXTRA) != 0) {
199             int xlen = inData.readUnsignedByte();
200             xlen |= inData.readUnsignedByte() << 8;
201 
202             // This isn't as efficient as calling in.skip would be,
203             // but it's lazier to handle unexpected end of input this way.
204             // Most files don't have an extra field anyway.
205             while (xlen-- > 0) {
206                 inData.readUnsignedByte();
207             }
208         }
209 
210         // Original file name
211         if ((flg & FNAME) != 0) {
212             parameters.setFilename(new String(readToNull(inData),
213                                               CharsetNames.ISO_8859_1));
214         }
215 
216         // Comment
217         if ((flg & FCOMMENT) != 0) {
218             parameters.setComment(new String(readToNull(inData),
219                                              CharsetNames.ISO_8859_1));
220         }
221 
222         // Header "CRC16" which is actually a truncated CRC32 (which isn't
223         // as good as real CRC16). I don't know if any encoder implementation
224         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
225         // doesn't support this field, but zlib seems to be able to at least
226         // skip over it.
227         if ((flg & FHCRC) != 0) {
228             inData.readShort();
229         }
230 
231         // Reset
232         inf.reset();
233         crc.reset();
234         memberSize = 0;
235 
236         return true;
237     }
238 
239     private byte[] readToNull(DataInputStream inData) throws IOException {
240         ByteArrayOutputStream bos = new ByteArrayOutputStream();
241         int b = 0;
242         while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
243             bos.write(b);
244         }
245         return bos.toByteArray();
246     }
247 
248     private int readLittleEndianInt(DataInputStream inData) throws IOException {
249         return inData.readUnsignedByte()
250             | (inData.readUnsignedByte() << 8)
251             | (inData.readUnsignedByte() << 16)
252             | (inData.readUnsignedByte() << 24);
253     }
254 
255     @Override
256     public int read() throws IOException {
257         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
258     }
259 
260     /**
261      * {@inheritDoc}
262      *
263      * @since 1.1
264      */
265     @Override
266     public int read(byte[] b, int off, int len) throws IOException {
267         if (endReached) {
268             return -1;
269         }
270 
271         int size = 0;
272 
273         while (len > 0) {
274             if (inf.needsInput()) {
275                 // Remember the current position because we may need to
276                 // rewind after reading too much input.
277                 in.mark(buf.length);
278 
279                 bufUsed = in.read(buf);
280                 if (bufUsed == -1) {
281                     throw new EOFException();
282                 }
283 
284                 inf.setInput(buf, 0, bufUsed);
285             }
286 
287             int ret;
288             try {
289                 ret = inf.inflate(b, off, len);
290             } catch (DataFormatException e) {
291                 throw new IOException("Gzip-compressed data is corrupt");
292             }
293 
294             crc.update(b, off, ret);
295             memberSize += ret;
296             off += ret;
297             len -= ret;
298             size += ret;
299             count(ret);
300 
301             if (inf.finished()) {
302                 // We may have read too many bytes. Rewind the read
303                 // position to match the actual amount used.
304                 //
305                 // NOTE: The "if" is there just in case. Since we used
306                 // in.mark earler, it should always skip enough.
307                 in.reset();
308 
309                 int skipAmount = bufUsed - inf.getRemaining();
310                 if (in.skip(skipAmount) != skipAmount) {
311                     throw new IOException();
312                 }
313 
314                 bufUsed = 0;
315 
316                 DataInputStream inData = new DataInputStream(in);
317 
318                 // CRC32
319                 long crcStored = 0;
320                 for (int i = 0; i < 4; ++i) {
321                     crcStored |= (long)inData.readUnsignedByte() << (i * 8);
322                 }
323 
324                 if (crcStored != crc.getValue()) {
325                     throw new IOException("Gzip-compressed data is corrupt "
326                                           + "(CRC32 error)");
327                 }
328 
329                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
330                 int isize = 0;
331                 for (int i = 0; i < 4; ++i) {
332                     isize |= inData.readUnsignedByte() << (i * 8);
333                 }
334 
335                 if (isize != memberSize) {
336                     throw new IOException("Gzip-compressed data is corrupt"
337                                           + "(uncompressed size mismatch)");
338                 }
339 
340                 // See if this is the end of the file.
341                 if (!decompressConcatenated || !init(false)) {
342                     inf.end();
343                     inf = null;
344                     endReached = true;
345                     return size == 0 ? -1 : size;
346                 }
347             }
348         }
349 
350         return size;
351     }
352 
353     /**
354      * Checks if the signature matches what is expected for a .gz file.
355      *
356      * @param signature the bytes to check
357      * @param length    the number of bytes to check
358      * @return          true if this is a .gz stream, false otherwise
359      *
360      * @since 1.1
361      */
362     public static boolean matches(byte[] signature, int length) {
363 
364         if (length < 2) {
365             return false;
366         }
367 
368         if (signature[0] != 31) {
369             return false;
370         }
371 
372         if (signature[1] != -117) {
373             return false;
374         }
375 
376         return true;
377     }
378 
379     /**
380      * Closes the input stream (unless it is System.in).
381      *
382      * @since 1.2
383      */
384     @Override
385     public void close() throws IOException {
386         if (inf != null) {
387             inf.end();
388             inf = null;
389         }
390 
391         if (this.in != System.in) {
392             this.in.close();
393         }
394     }
395 }