001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.BufferedInputStream;
022import java.io.ByteArrayOutputStream;
023import java.io.DataInput;
024import java.io.DataInputStream;
025import java.io.EOFException;
026import java.io.IOException;
027import java.io.InputStream;
028import java.util.zip.CRC32;
029import java.util.zip.DataFormatException;
030import java.util.zip.Deflater;
031import java.util.zip.Inflater;
032
033import org.apache.commons.compress.compressors.CompressorInputStream;
034import org.apache.commons.compress.utils.ByteUtils;
035import org.apache.commons.compress.utils.InputStreamStatistics;
036import org.apache.commons.io.input.CountingInputStream;
037
038/**
039 * Input stream that decompresses .gz files.
040 *
041 * <p>
042 * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
043 * </p>
044 *
045 * <p>
046 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz files: it stops after the first member and silently ignores the rest. It doesn't
 * leave the read position to point to the beginning of the next member, which makes it difficult to work around the lack of concatenation support.
048 * </p>
049 *
050 * <p>
051 * Instead of using {@code GZIPInputStream}, this class has its own .gz container format decoder. The actual decompression is done with
052 * {@link java.util.zip.Inflater}.
053 * </p>
054 *
055 * <p>
056 * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)} with some {@code
057 * InputStream} {@code in} then {@link #read} will return -1 as soon as the first internal member has been read completely. The stream {@code in} will be
058 * positioned at the start of the second gzip member if there is one.
059 * </p>
060 *
061 * <p>
062 * If you use the constructor {@code GzipCompressorInputStream(in,
063 * true)} with some {@code InputStream} {@code in} then {@link #read} will return -1 once the stream {@code in} has been exhausted. The data read from a stream
064 * constructed this way will consist of the concatenated data of all gzip members contained inside {@code
065 * in}.
066 * </p>
067 *
068 * @see "https://tools.ietf.org/html/rfc1952"
069 */
070public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
071
072    // Header flags
073    // private static final int FTEXT = 0x01; // Uninteresting for us
074    private static final int FHCRC = 0x02;
075    private static final int FEXTRA = 0x04;
076    private static final int FNAME = 0x08;
077    private static final int FCOMMENT = 0x10;
078    private static final int FRESERVED = 0xE0;
079
080    /**
081     * Checks if the signature matches what is expected for a .gz file.
082     *
083     * @param signature the bytes to check
084     * @param length    the number of bytes to check
085     * @return true if this is a .gz stream, false otherwise
086     *
087     * @since 1.1
088     */
089    public static boolean matches(final byte[] signature, final int length) {
090        return length >= 2 && signature[0] == 31 && signature[1] == -117;
091    }
092
093    private static byte[] readToNull(final DataInput inData) throws IOException {
094        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
095            int b;
096            while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
097                bos.write(b);
098            }
099            return bos.toByteArray();
100        }
101    }
102
103    private final CountingInputStream countingStream;
104
105    // Compressed input stream, possibly wrapped in a
106    // BufferedInputStream, always wrapped in countingStream above
107    private final InputStream in;
108
109    // True if decompressing multi member streams.
110    private final boolean decompressConcatenated;
111
112    // Buffer to hold the input data
113    private final byte[] buf = new byte[8192];
114
115    // Amount of data in buf.
116    private int bufUsed;
117
118    // Decompressor
119    private Inflater inf = new Inflater(true);
120
121    // CRC32 from uncompressed data
122    private final CRC32 crc = new CRC32();
123
124    // True once everything has been decompressed
125    private boolean endReached;
126
127    // used in no-arg read method
128    private final byte[] oneByte = new byte[1];
129
130    private final GzipParameters parameters = new GzipParameters();
131
132    /**
133     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
134     * <p>
135     * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
136     *
137     * @param inputStream the InputStream from which this object should be created of
138     *
139     * @throws IOException if the stream could not be created
140     */
141    public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
142        this(inputStream, false);
143    }
144
145    /**
146     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
147     * <p>
148     * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
149     * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
150     * {@code mark} isn't supported, the input position will be undefined.
151     *
152     * @param inputStream            the InputStream from which this object should be created of
153     * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
154     *
155     * @throws IOException if the stream could not be created
156     */
157    public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
158        countingStream = new CountingInputStream(inputStream);
159        // Mark support is strictly needed for concatenated files only,
160        // but it's simpler if it is always available.
161        if (countingStream.markSupported()) {
162            in = countingStream;
163        } else {
164            in = new BufferedInputStream(countingStream);
165        }
166
167        this.decompressConcatenated = decompressConcatenated;
168        init(true);
169    }
170
171    /**
172     * Closes the input stream (unless it is System.in).
173     *
174     * @since 1.2
175     */
176    @Override
177    public void close() throws IOException {
178        if (inf != null) {
179            inf.end();
180            inf = null;
181        }
182
183        if (this.in != System.in) {
184            this.in.close();
185        }
186    }
187
188    /**
189     * @since 1.17
190     */
191    @Override
192    public long getCompressedCount() {
193        return countingStream.getByteCount();
194    }
195
196    /**
197     * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
198     *
199     * @return the stream's meta data
200     * @since 1.8
201     */
202    public GzipParameters getMetaData() {
203        return parameters;
204    }
205
206    private boolean init(final boolean isFirstMember) throws IOException {
207        assert isFirstMember || decompressConcatenated;
208
209        // Check the magic bytes without a possibility of EOFException.
210        final int magic0 = in.read();
211
212        // If end of input was reached after decompressing at least
213        // one .gz member, we have reached the end of the file successfully.
214        if (magic0 == -1 && !isFirstMember) {
215            return false;
216        }
217
218        if (magic0 != 31 || in.read() != 139) {
219            throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
220        }
221
222        // Parsing the rest of the header may throw EOFException.
223        final DataInput inData = new DataInputStream(in);
224        final int method = inData.readUnsignedByte();
225        if (method != Deflater.DEFLATED) {
226            throw new IOException("Unsupported compression method " + method + " in the .gz header");
227        }
228
229        final int flg = inData.readUnsignedByte();
230        if ((flg & FRESERVED) != 0) {
231            throw new IOException("Reserved flags are set in the .gz header");
232        }
233
234        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
235        switch (inData.readUnsignedByte()) { // extra flags
236        case 2:
237            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
238            break;
239        case 4:
240            parameters.setCompressionLevel(Deflater.BEST_SPEED);
241            break;
242        default:
243            // ignored for now
244            break;
245        }
246        parameters.setOperatingSystem(inData.readUnsignedByte());
247
248        // Extra field, ignored
249        if ((flg & FEXTRA) != 0) {
250            int xlen = inData.readUnsignedByte();
251            xlen |= inData.readUnsignedByte() << 8;
252
253            // This isn't as efficient as calling in.skip would be,
254            // but it's lazier to handle unexpected end of input this way.
255            // Most files don't have an extra field anyway.
256            while (xlen-- > 0) {
257                inData.readUnsignedByte();
258            }
259        }
260
261        // Original file name
262        if ((flg & FNAME) != 0) {
263            parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
264        }
265
266        // Comment
267        if ((flg & FCOMMENT) != 0) {
268            parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
269        }
270
271        // Header "CRC16" which is actually a truncated CRC32 (which isn't
272        // as good as real CRC16). I don't know if any encoder implementation
273        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
274        // doesn't support this field, but zlib seems to be able to at least
275        // skip over it.
276        if ((flg & FHCRC) != 0) {
277            inData.readShort();
278        }
279
280        // Reset
281        inf.reset();
282        crc.reset();
283
284        return true;
285    }
286
287    @Override
288    public int read() throws IOException {
289        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
290    }
291
292    /**
293     * {@inheritDoc}
294     *
295     * @since 1.1
296     */
297    @Override
298    public int read(final byte[] b, int off, int len) throws IOException {
299        if (len == 0) {
300            return 0;
301        }
302        if (endReached) {
303            return -1;
304        }
305
306        int size = 0;
307
308        while (len > 0) {
309            if (inf.needsInput()) {
310                // Remember the current position because we may need to
311                // rewind after reading too much input.
312                in.mark(buf.length);
313
314                bufUsed = in.read(buf);
315                if (bufUsed == -1) {
316                    throw new EOFException();
317                }
318
319                inf.setInput(buf, 0, bufUsed);
320            }
321
322            final int ret;
323            try {
324                ret = inf.inflate(b, off, len);
325            } catch (final DataFormatException e) { // NOSONAR
326                throw new IOException("Gzip-compressed data is corrupt");
327            }
328
329            crc.update(b, off, ret);
330            off += ret;
331            len -= ret;
332            size += ret;
333            count(ret);
334
335            if (inf.finished()) {
336                // We may have read too many bytes. Rewind the read
337                // position to match the actual amount used.
338                in.reset();
339
340                final int skipAmount = bufUsed - inf.getRemaining();
341                if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
342                    throw new IOException();
343                }
344
345                bufUsed = 0;
346
347                final DataInput inData = new DataInputStream(in);
348
349                // CRC32
350                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
351
352                if (crcStored != crc.getValue()) {
353                    throw new IOException("Gzip-compressed data is corrupt " + "(CRC32 error)");
354                }
355
356                // Uncompressed size modulo 2^32 (ISIZE in the spec)
357                final long isize = ByteUtils.fromLittleEndian(inData, 4);
358
359                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
360                    throw new IOException("Gzip-compressed data is corrupt" + "(uncompressed size mismatch)");
361                }
362
363                // See if this is the end of the file.
364                if (!decompressConcatenated || !init(false)) {
365                    inf.end();
366                    inf = null;
367                    endReached = true;
368                    return size == 0 ? -1 : size;
369                }
370            }
371        }
372
373        return size;
374    }
375}