View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.compressors.gzip;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.EOFException;
24  import java.io.InputStream;
25  import java.io.DataInputStream;
26  import java.io.BufferedInputStream;
27  import java.util.zip.DataFormatException;
28  import java.util.zip.Deflater;
29  import java.util.zip.Inflater;
30  import java.util.zip.CRC32;
31  
32  import org.apache.commons.compress.compressors.CompressorInputStream;
33  import org.apache.commons.compress.utils.CharsetNames;
34  
35  /**
36   * Input stream that decompresses .gz files.
37   * This supports decompressing concatenated .gz files which is important
38   * when decompressing standalone .gz files.
39   * <p>
40   * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
41   * files: it stops after the first member and silently ignores the rest.
42   * It doesn't leave the read position to point to the beginning of the next
43   * member, which makes it difficult workaround the lack of concatenation
44   * support.
45   * <p>
46   * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
47   * container format decoder. The actual decompression is done with
48   * {@link java.util.zip.Inflater}.
49   */
50  public class GzipCompressorInputStream extends CompressorInputStream {
51      // Header flags
52      // private static final int FTEXT = 0x01; // Uninteresting for us
53      private static final int FHCRC = 0x02;
54      private static final int FEXTRA = 0x04;
55      private static final int FNAME = 0x08;
56      private static final int FCOMMENT = 0x10;
57      private static final int FRESERVED = 0xE0;
58  
59      // Compressed input stream, possibly wrapped in a BufferedInputStream
60      private final InputStream in;
61  
62      // True if decompressing multimember streams.
63      private final boolean decompressConcatenated;
64  
65      // Buffer to hold the input data
66      private final byte[] buf = new byte[8192];
67  
68      // Amount of data in buf.
69      private int bufUsed = 0;
70  
71      // Decompressor
72      private Inflater inf = new Inflater(true);
73  
74      // CRC32 from uncompressed data
75      private final CRC32 crc = new CRC32();
76  
77      // True once everything has been decompressed
78      private boolean endReached = false;
79  
80      // used in no-arg read method
81      private final byte[] oneByte = new byte[1];
82  
83      private final GzipParameters parameters = new GzipParameters();
84  
85      /**
86       * Constructs a new input stream that decompresses gzip-compressed data
87       * from the specified input stream.
88       * <p>
89       * This is equivalent to
90       * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
91       * will not decompress concatenated .gz files.
92       *
93       * @param inputStream  the InputStream from which this object should
94       *                     be created of
95       *
96       * @throws IOException if the stream could not be created
97       */
98      public GzipCompressorInputStream(InputStream inputStream)
99              throws IOException {
100         this(inputStream, false);
101     }
102 
103     /**
104      * Constructs a new input stream that decompresses gzip-compressed data
105      * from the specified input stream.
106      * <p>
107      * If <code>decompressConcatenated</code> is {@code false}:
108      * This decompressor might read more input than it will actually use.
109      * If <code>inputStream</code> supports <code>mark</code> and
110      * <code>reset</code>, then the input position will be adjusted
111      * so that it is right after the last byte of the compressed stream.
112      * If <code>mark</code> isn't supported, the input position will be
113      * undefined.
114      *
115      * @param inputStream  the InputStream from which this object should
116      *                     be created of
117      * @param decompressConcatenated
118      *                     if true, decompress until the end of the input;
119      *                     if false, stop after the first .gz member
120      *
121      * @throws IOException if the stream could not be created
122      */
123     public GzipCompressorInputStream(InputStream inputStream,
124                                      boolean decompressConcatenated)
125             throws IOException {
126         // Mark support is strictly needed for concatenated files only,
127         // but it's simpler if it is always available.
128         if (inputStream.markSupported()) {
129             in = inputStream;
130         } else {
131             in = new BufferedInputStream(inputStream);
132         }
133 
134         this.decompressConcatenated = decompressConcatenated;
135         init(true);
136     }
137 
138     /**
139      * Provides the stream's meta data - may change with each stream
140      * when decompressing concatenated streams.
141      * @return the stream's meta data
142      * @since 1.8
143      */
144     public GzipParameters getMetaData() {
145         return parameters;
146     }
147 
148     private boolean init(boolean isFirstMember) throws IOException {
149         assert isFirstMember || decompressConcatenated;
150 
151         // Check the magic bytes without a possibility of EOFException.
152         int magic0 = in.read();
153         int magic1 = in.read();
154 
155         // If end of input was reached after decompressing at least
156         // one .gz member, we have reached the end of the file successfully.
157         if (magic0 == -1 && !isFirstMember) {
158             return false;
159         }
160 
161         if (magic0 != 31 || magic1 != 139) {
162             throw new IOException(isFirstMember
163                                   ? "Input is not in the .gz format"
164                                   : "Garbage after a valid .gz stream");
165         }
166 
167         // Parsing the rest of the header may throw EOFException.
168         DataInputStream inData = new DataInputStream(in);
169         int method = inData.readUnsignedByte();
170         if (method != Deflater.DEFLATED) {
171             throw new IOException("Unsupported compression method "
172                                   + method + " in the .gz header");
173         }
174 
175         int flg = inData.readUnsignedByte();
176         if ((flg & FRESERVED) != 0) {
177             throw new IOException(
178                     "Reserved flags are set in the .gz header");
179         }
180 
181         parameters.setModificationTime(readLittleEndianInt(inData) * 1000);
182         switch (inData.readUnsignedByte()) { // extra flags
183         case 2:
184             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
185             break;
186         case 4:
187             parameters.setCompressionLevel(Deflater.BEST_SPEED);
188             break;
189         default:
190             // ignored for now
191             break;
192         }
193         parameters.setOperatingSystem(inData.readUnsignedByte());
194 
195         // Extra field, ignored
196         if ((flg & FEXTRA) != 0) {
197             int xlen = inData.readUnsignedByte();
198             xlen |= inData.readUnsignedByte() << 8;
199 
200             // This isn't as efficient as calling in.skip would be,
201             // but it's lazier to handle unexpected end of input this way.
202             // Most files don't have an extra field anyway.
203             while (xlen-- > 0) {
204                 inData.readUnsignedByte();
205             }
206         }
207 
208         // Original file name
209         if ((flg & FNAME) != 0) {
210             parameters.setFilename(new String(readToNull(inData),
211                                               CharsetNames.ISO_8859_1));
212         }
213 
214         // Comment
215         if ((flg & FCOMMENT) != 0) {
216             parameters.setComment(new String(readToNull(inData),
217                                              CharsetNames.ISO_8859_1));
218         }
219 
220         // Header "CRC16" which is actually a truncated CRC32 (which isn't
221         // as good as real CRC16). I don't know if any encoder implementation
222         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
223         // doesn't support this field, but zlib seems to be able to at least
224         // skip over it.
225         if ((flg & FHCRC) != 0) {
226             inData.readShort();
227         }
228 
229         // Reset
230         inf.reset();
231         crc.reset();
232 
233         return true;
234     }
235 
236     private byte[] readToNull(DataInputStream inData) throws IOException {
237         ByteArrayOutputStream bos = new ByteArrayOutputStream();
238         int b = 0;
239         while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
240             bos.write(b);
241         }
242         return bos.toByteArray();
243     }
244 
245     private long readLittleEndianInt(DataInputStream inData) throws IOException {
246         return inData.readUnsignedByte()
247             | (inData.readUnsignedByte() << 8)
248             | (inData.readUnsignedByte() << 16)
249             | (((long) inData.readUnsignedByte()) << 24);
250     }
251 
252     @Override
253     public int read() throws IOException {
254         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
255     }
256 
257     /**
258      * {@inheritDoc}
259      *
260      * @since 1.1
261      */
262     @Override
263     public int read(byte[] b, int off, int len) throws IOException {
264         if (endReached) {
265             return -1;
266         }
267 
268         int size = 0;
269 
270         while (len > 0) {
271             if (inf.needsInput()) {
272                 // Remember the current position because we may need to
273                 // rewind after reading too much input.
274                 in.mark(buf.length);
275 
276                 bufUsed = in.read(buf);
277                 if (bufUsed == -1) {
278                     throw new EOFException();
279                 }
280 
281                 inf.setInput(buf, 0, bufUsed);
282             }
283 
284             int ret;
285             try {
286                 ret = inf.inflate(b, off, len);
287             } catch (DataFormatException e) {
288                 throw new IOException("Gzip-compressed data is corrupt");
289             }
290 
291             crc.update(b, off, ret);
292             off += ret;
293             len -= ret;
294             size += ret;
295             count(ret);
296 
297             if (inf.finished()) {
298                 // We may have read too many bytes. Rewind the read
299                 // position to match the actual amount used.
300                 //
301                 // NOTE: The "if" is there just in case. Since we used
302                 // in.mark earler, it should always skip enough.
303                 in.reset();
304 
305                 int skipAmount = bufUsed - inf.getRemaining();
306                 if (in.skip(skipAmount) != skipAmount) {
307                     throw new IOException();
308                 }
309 
310                 bufUsed = 0;
311 
312                 DataInputStream inData = new DataInputStream(in);
313 
314                 // CRC32
315                 long crcStored = readLittleEndianInt(inData);
316 
317                 if (crcStored != crc.getValue()) {
318                     throw new IOException("Gzip-compressed data is corrupt "
319                                           + "(CRC32 error)");
320                 }
321 
322                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
323                 long isize = readLittleEndianInt(inData);
324 
325                 if (isize != (inf.getBytesWritten() & 0xffffffffl)) {
326                     throw new IOException("Gzip-compressed data is corrupt"
327                                           + "(uncompressed size mismatch)");
328                 }
329 
330                 // See if this is the end of the file.
331                 if (!decompressConcatenated || !init(false)) {
332                     inf.end();
333                     inf = null;
334                     endReached = true;
335                     return size == 0 ? -1 : size;
336                 }
337             }
338         }
339 
340         return size;
341     }
342 
343     /**
344      * Checks if the signature matches what is expected for a .gz file.
345      *
346      * @param signature the bytes to check
347      * @param length    the number of bytes to check
348      * @return          true if this is a .gz stream, false otherwise
349      *
350      * @since 1.1
351      */
352     public static boolean matches(byte[] signature, int length) {
353 
354         if (length < 2) {
355             return false;
356         }
357 
358         if (signature[0] != 31) {
359             return false;
360         }
361 
362         if (signature[1] != -117) {
363             return false;
364         }
365 
366         return true;
367     }
368 
369     /**
370      * Closes the input stream (unless it is System.in).
371      *
372      * @since 1.2
373      */
374     @Override
375     public void close() throws IOException {
376         if (inf != null) {
377             inf.end();
378             inf = null;
379         }
380 
381         if (this.in != System.in) {
382             this.in.close();
383         }
384     }
385 }