001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.BufferedInputStream;
022import java.io.ByteArrayOutputStream;
023import java.io.DataInput;
024import java.io.DataInputStream;
025import java.io.EOFException;
026import java.io.IOException;
027import java.io.InputStream;
028import java.nio.charset.Charset;
029import java.util.zip.CRC32;
030import java.util.zip.DataFormatException;
031import java.util.zip.Deflater;
032import java.util.zip.Inflater;
033
034import org.apache.commons.compress.compressors.CompressorInputStream;
035import org.apache.commons.compress.utils.ByteUtils;
036import org.apache.commons.compress.utils.InputStreamStatistics;
037import org.apache.commons.io.IOUtils;
038import org.apache.commons.io.build.AbstractOrigin;
039import org.apache.commons.io.build.AbstractStreamBuilder;
040import org.apache.commons.io.function.IOConsumer;
041import org.apache.commons.io.input.BoundedInputStream;
042
043/**
044 * Input stream that decompresses GZIP (.gz) files.
045 *
046 * <p>
047 * This supports decompressing concatenated GZIP files which is important when decompressing standalone GZIP files.
048 * </p>
049 * <p>
050 * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder. Internally, decompression is done using
051 * {@link java.util.zip.Inflater}.
052 * </p>
053 * <p>
054 * If you use the constructor {@code GzipCompressorInputStream(in)}, {@code Builder.setDecompressConcatenated(false)}, or
055 * {@code GzipCompressorInputStream(in, false)}, then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this
056 * case, if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()}, then it will be left positioned just
057 * after the end of the encoded GZIP member; otherwise, some indeterminate number of extra bytes following the encoded GZIP member will have been consumed and
058 * discarded.
059 * </p>
060 * <p>
061 * If you use the {@code Builder.setDecompressConcatenated(true)} or {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
062 * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded GZIP member, otherwise an
063 * {@link IOException} is thrown. The data read from a stream constructed this way will consist of the concatenated data of all of the encoded GZIP members in
064 * order.
065 * </p>
066 * <p>
067 * To build an instance, use {@link Builder}.
068 * </p>
069 *
070 * @see Builder
071 * @see <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 1952 GZIP File Format Specification</a>
072 */
073public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
074
075    // @formatter:off
076    /**
077     * Builds a new {@link GzipCompressorInputStream}.
078     *
079     * <p>
080     * For example:
081     * </p>
082     * <pre>{@code
083     * GzipCompressorInputStream s = GzipCompressorInputStream.builder()
084     *   .setPath(path)
085     *   .setFileNameCharset(StandardCharsets.ISO_8859_1)
086     *   .get();}
087     * </pre>
088     *
089     * @see #get()
090     * @since 1.28.0
091     */
092    // @formatter:on
093    public static class Builder extends AbstractStreamBuilder<GzipCompressorInputStream, Builder> {
094
095        /** True if decompressing multi-member streams. */
096        private boolean decompressConcatenated;
097
098        private Charset fileNameCharset = GzipUtils.GZIP_ENCODING;
099
100        private IOConsumer<GzipCompressorInputStream> onMemberStart;
101
102        private IOConsumer<GzipCompressorInputStream> onMemberEnd;
103
104        /**
105         * Constructs a new builder of {@link GzipCompressorInputStream}.
106         */
107        public Builder() {
108            // empty
109        }
110
111        /**
112         * Builds a new {@link GzipCompressorInputStream}.
113         * <p>
114         * You must set input that supports {@link InputStream}, otherwise, this method throws an exception.
115         * </p>
116         *
117         * @return a new instance.
118         * @throws IllegalStateException         if the {@code origin} is {@code null}.
119         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
120         * @see AbstractOrigin#getInputStream(java.nio.file.OpenOption...)
121         */
122        @Override
123        public GzipCompressorInputStream get() throws IOException {
124            return new GzipCompressorInputStream(this);
125        }
126
127        /**
128         * Sets whether we should allow decompressing multiple members.
129         *
130         * @param decompressConcatenated whether we should allow decompressing multiple members.
131         * @return this instance.
132         */
133        public Builder setDecompressConcatenated(final boolean decompressConcatenated) {
134            this.decompressConcatenated = decompressConcatenated;
135            return this;
136        }
137
138        /**
139         * Sets the Charset to use for writing file names and comments, where null maps to {@link GzipUtils#GZIP_ENCODING}.
140         * <p>
141         * <em>Setting a value other than {@link GzipUtils#GZIP_ENCODING} is not compliant with the <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC
142         * 1952 GZIP File Format Specification</a></em>. Use at your own risk of interoperability issues.
143         * </p>
144         * <p>
145         * The default value is {@link GzipUtils#GZIP_ENCODING}.
146         * </p>
147         *
148         * @param fileNameCharset the Charset to use for writing file names and comments, null maps to {@link GzipUtils#GZIP_ENCODING}.
149         * @return this instance.
150         */
151        public Builder setFileNameCharset(final Charset fileNameCharset) {
152            this.fileNameCharset = fileNameCharset;
153            return this;
154        }
155
156        /**
157         * Sets the consumer called when a member <em>trailer</em> is parsed.
158         * <p>
159         * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
160         * </p>
161         * <p>
162         * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
163         * </p>
164         *
165         * @param onMemberEnd The consumer.
166         * @return this instance.
167         * @see GzipCompressorInputStream#getMetaData()
168         */
169        public Builder setOnMemberEnd(final IOConsumer<GzipCompressorInputStream> onMemberEnd) {
170            this.onMemberEnd = onMemberEnd;
171            return this;
172        }
173
174        /**
175         * Sets the consumer called when a member <em>header</em> is parsed.
176         * <p>
177         * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}.
178         * </p>
179         * <p>
180         * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set.
181         * </p>
182         *
183         * @param onMemberStart The consumer.
184         * @return this instance.
185         * @see GzipCompressorInputStream#getMetaData()
186         */
187        public Builder setOnMemberStart(final IOConsumer<GzipCompressorInputStream> onMemberStart) {
188            this.onMemberStart = onMemberStart;
189            return this;
190        }
191    }
192
    /** Shared do-nothing consumer, used in place of a member callback that was not set on the builder. */
    private static final IOConsumer<GzipCompressorInputStream> NOOP = IOConsumer.noop();
194
    /**
     * Constructs a new builder of {@link GzipCompressorInputStream}.
     * <p>
     * Each call returns a fresh {@link Builder}; configure it and call {@link Builder#get()} to create the stream.
     * </p>
     *
     * @return a new builder of {@link GzipCompressorInputStream}.
     * @since 1.28.0
     */
    public static Builder builder() {
        return new Builder();
    }
204
205    /**
206     * Checks if the signature matches what is expected for a .gz file.
207     *
208     * @param signature the bytes to check
209     * @param length    the number of bytes to check
210     * @return true if this is a .gz stream, false otherwise
211     * @since 1.1
212     */
213    public static boolean matches(final byte[] signature, final int length) {
214        return length >= 2 && signature[0] == 31 && signature[1] == -117;
215    }
216
217    private static byte[] readToNull(final DataInput inData) throws IOException {
218        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
219            int b;
220            while ((b = inData.readUnsignedByte()) != 0) { // NOSONAR
221                bos.write(b);
222            }
223            return bos.toByteArray();
224        }
225    }
226
    /** Buffer to hold the input data. */
    private final byte[] buf = new byte[8192];

    /** Amount of data in buf. */
    private int bufUsed;

    /** Wraps the caller's input stream; its byte count is what {@link #getCompressedCount()} reports. */
    private final BoundedInputStream countingStream;

    /** CRC32 from uncompressed data. */
    private final CRC32 crc = new CRC32();

    /** True if decompressing multi-member streams. */
    private final boolean decompressConcatenated;

    /** True once everything has been decompressed. */
    private boolean endReached;

    /** Charset handed to each member's {@link GzipParameters}, which supplies the charset for decoding the header file name and comment. */
    private final Charset fileNameCharset;

    /**
     * Compressed input stream, possibly wrapped in a BufferedInputStream, always wrapped in countingStream above
     */
    private final InputStream in;

    /** Decompressor. Set to {@code null} once it has been ended, either by {@link #close()} or when the end of the data is reached. */
    private Inflater inflater = new Inflater(true);

    /** Buffer for no-argument read method. */
    private final byte[] oneByte = new byte[1];

    /** Metadata of the member whose header was parsed most recently; replaced each time a member header is parsed. */
    private GzipParameters parameters;

    /** Called after a member header has been parsed; never {@code null}. */
    private final IOConsumer<GzipCompressorInputStream> onMemberStart;

    /** Called after a member trailer has been parsed and verified; never {@code null}. */
    private final IOConsumer<GzipCompressorInputStream> onMemberEnd;
262
263    @SuppressWarnings("resource") // caller closes
264    private GzipCompressorInputStream(final Builder builder) throws IOException {
265        countingStream = BoundedInputStream.builder().setInputStream(builder.getInputStream()).get();
266        // Mark support is strictly needed for concatenated files only,
267        // but it's simpler if it is always available.
268        in = countingStream.markSupported() ? countingStream : new BufferedInputStream(countingStream);
269        this.decompressConcatenated = builder.decompressConcatenated;
270        this.fileNameCharset = builder.fileNameCharset;
271        this.onMemberStart = builder.onMemberStart != null ? builder.onMemberStart : NOOP;
272        this.onMemberEnd = builder.onMemberEnd != null ? builder.onMemberEnd : NOOP;
273        init(true);
274    }
275
    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
     * </p>
     * <p>
     * The header of the first GZIP member is read during construction.
     * </p>
     *
     * @param inputStream the InputStream to read the compressed data from
     * @throws IOException if an I/O error occurs or the input does not start with a valid GZIP member header
     */
    public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
        this(builder().setInputStream(inputStream));
    }
288
    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
     * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
     * {@code mark} isn't supported, the input position will be undefined.
     * </p>
     * <p>
     * The header of the first GZIP member is read during construction.
     * </p>
     *
     * @param inputStream            the InputStream to read the compressed data from
     * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
     * @throws IOException if an I/O error occurs or the input does not start with a valid GZIP member header
     * @deprecated Use {@link Builder#get()}.
     */
    @Deprecated
    public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
        this(builder().setInputStream(inputStream).setDecompressConcatenated(decompressConcatenated));
    }
306
307    /**
308     * Closes the input stream (unless it is System.in).
309     *
310     * @since 1.2
311     */
312    @Override
313    public void close() throws IOException {
314        if (inflater != null) {
315            inflater.end();
316            inflater = null;
317        }
318        if (this.in != System.in) {
319            this.in.close();
320        }
321    }
322
    /**
     * {@inheritDoc}.
     * <p>
     * The value is the byte count reported by the counting stream that wraps the input stream.
     * </p>
     *
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getCount();
    }
332
    /**
     * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
     * <p>
     * The returned object is the one created when the most recent member header was parsed; its trailer CRC and size are set once that member's trailer has
     * been read.
     * </p>
     *
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }
342
343    private boolean init(final boolean isFirstMember) throws IOException {
344        if (!isFirstMember && !decompressConcatenated) { // at least one must be true
345            throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false.");
346        }
347        // Check the magic bytes without a possibility of EOFException.
348        final int magic0 = in.read();
349        // If end of input was reached after decompressing at least
350        // one .gz member, we have reached the end of the file successfully.
351        if (magic0 == -1 && !isFirstMember) {
352            return false;
353        }
354        if (magic0 != GzipUtils.ID1 || in.read() != GzipUtils.ID2) {
355            throw new IOException(isFirstMember ? "Input is not in the .gz format." : "Unexpected data after a valid .gz stream.");
356        }
357        parameters = new GzipParameters();
358        parameters.setFileNameCharset(fileNameCharset);
359        // Parsing the rest of the header may throw EOFException.
360        final DataInput inData = new DataInputStream(in);
361        final int method = inData.readUnsignedByte();
362        if (method != Deflater.DEFLATED) {
363            throw new IOException("Unsupported compression method " + method + " in the .gz header");
364        }
365        final int flg = inData.readUnsignedByte();
366        if ((flg & GzipUtils.FRESERVED) != 0) {
367            throw new IOException("Reserved flags are set in the .gz header.");
368        }
369        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4));
370        switch (inData.readUnsignedByte()) { // extra flags
371        case GzipUtils.XFL_MAX_COMPRESSION:
372            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
373            break;
374        case GzipUtils.XFL_MAX_SPEED:
375            parameters.setCompressionLevel(Deflater.BEST_SPEED);
376            break;
377        default:
378            parameters.setCompressionLevel(Deflater.DEFAULT_COMPRESSION);
379            break;
380        }
381        parameters.setOperatingSystem(inData.readUnsignedByte());
382        // Extra field
383        if ((flg & GzipUtils.FEXTRA) != 0) {
384            int xlen = inData.readUnsignedByte();
385            xlen |= inData.readUnsignedByte() << 8;
386            final byte[] extra = new byte[xlen];
387            inData.readFully(extra);
388            parameters.setExtraField(ExtraField.fromBytes(extra));
389        }
390        // Original file name
391        if ((flg & GzipUtils.FNAME) != 0) {
392            parameters.setFileName(new String(readToNull(inData), parameters.getFileNameCharset()));
393        }
394        // Comment
395        if ((flg & GzipUtils.FCOMMENT) != 0) {
396            parameters.setComment(new String(readToNull(inData), parameters.getFileNameCharset()));
397        }
398        // Header "CRC16" which is actually a truncated CRC32 (which isn't
399        // as good as real CRC16). I don't know if any encoder implementation
400        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
401        // doesn't support this field, but zlib seems to be able to at least
402        // skip over it.
403        if ((flg & GzipUtils.FHCRC) != 0) {
404            parameters.setHeaderCRC(true);
405            inData.readShort();
406        }
407        // Reset
408        inflater.reset();
409        crc.reset();
410        onMemberStart.accept(this);
411        return true;
412    }
413
414    @Override
415    public int read() throws IOException {
416        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
417    }
418
419    /**
420     * {@inheritDoc}
421     *
422     * @since 1.1
423     */
424    @Override
425    public int read(final byte[] b, int off, int len) throws IOException {
426        if (len == 0) {
427            return 0;
428        }
429        if (endReached) {
430            return -1;
431        }
432
433        int size = 0;
434
435        while (len > 0) {
436            if (inflater.needsInput()) {
437                // Remember the current position because we may need to
438                // rewind after reading too much input.
439                in.mark(buf.length);
440
441                bufUsed = in.read(buf);
442                if (bufUsed == -1) {
443                    throw new EOFException();
444                }
445
446                inflater.setInput(buf, 0, bufUsed);
447            }
448
449            final int ret;
450            try {
451                ret = inflater.inflate(b, off, len);
452            } catch (final DataFormatException e) { // NOSONAR
453                throw new IOException("Gzip-compressed data is corrupt.", e);
454            }
455
456            crc.update(b, off, ret);
457            off += ret;
458            len -= ret;
459            size += ret;
460            count(ret);
461
462            if (inflater.finished()) {
463                // We may have read too many bytes. Rewind the read
464                // position to match the actual amount used.
465                in.reset();
466                final int skipAmount = bufUsed - inflater.getRemaining();
467                if (IOUtils.skip(in, skipAmount) != skipAmount) {
468                    throw new IOException();
469                }
470                bufUsed = 0;
471                final DataInput inData = new DataInputStream(in);
472                // CRC32
473                final long trailerCrc = ByteUtils.fromLittleEndian(inData, 4);
474                if (trailerCrc != crc.getValue()) {
475                    throw new IOException("Gzip-compressed data is corrupt (CRC32 error).");
476                }
477                // Uncompressed size modulo 2^32, ISIZE in the RFC.
478                final long iSize = ByteUtils.fromLittleEndian(inData, 4);
479                if (iSize != (inflater.getBytesWritten() & 0xffffffffL)) {
480                    throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch).");
481                }
482                parameters.setTrailerCrc(trailerCrc);
483                parameters.setTrailerISize(iSize);
484                onMemberEnd.accept(this);
485                // See if this is the end of the file.
486                if (!decompressConcatenated || !init(false)) {
487                    inflater.end();
488                    inflater = null;
489                    endReached = true;
490                    return size == 0 ? -1 : size;
491                }
492            }
493        }
494
495        return size;
496    }
497}