GzipCompressorInputStream.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */
  19. package org.apache.commons.compress.compressors.gzip;

  20. import java.io.BufferedInputStream;
  21. import java.io.ByteArrayOutputStream;
  22. import java.io.DataInput;
  23. import java.io.DataInputStream;
  24. import java.io.EOFException;
  25. import java.io.IOException;
  26. import java.io.InputStream;
  27. import java.util.zip.CRC32;
  28. import java.util.zip.DataFormatException;
  29. import java.util.zip.Deflater;
  30. import java.util.zip.Inflater;

  31. import org.apache.commons.compress.compressors.CompressorInputStream;
  32. import org.apache.commons.compress.utils.ByteUtils;
  33. import org.apache.commons.compress.utils.InputStreamStatistics;
  34. import org.apache.commons.io.input.BoundedInputStream;

  35. /**
  36.  * Input stream that decompresses .gz files.
  37.  *
  38.  * <p>
  39.  * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
  40.  * </p>
  41.  *
  42.  * <p>
  43.  * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder.
  44.  * The actual decompression is done with {@link java.util.zip.Inflater}.
  45.  * </p>
  46.  *
  47.  * <p>
  48.  * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)},
  49.  * then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this case,
  50.  * if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()},
  51.  * then it will be left positioned just after the end of the encoded GZIP member; otherwise, some indeterminate number
  52.  * of extra bytes following the encoded GZIP member will have been consumed and discarded.
  53.  * </p>
  54.  *
  55.  * <p>
  56.  * If you use the constructor {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
  57.  * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded
  58.  * GZIP member, otherwise an {@link IOException} is thrown. The data read from a stream constructed this way will consist
  59.  * of the concatenated data of all of the encoded GZIP members in order.
  60.  * </p>
  61.  *
  62.  * @see "https://tools.ietf.org/html/rfc1952"
  63.  */
  64. public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {

  65.     // Header flag bits, tested by init() against the flags byte it reads from each member header.
  66.     // private static final int FTEXT = 0x01; // Uninteresting for us
  67.     private static final int FHCRC = 0x02; // a two-byte header checksum follows; init() reads and discards it
  68.     private static final int FEXTRA = 0x04; // a length-prefixed extra field follows; init() skips over it
  69.     private static final int FNAME = 0x08; // a zero-terminated original file name follows
  70.     private static final int FCOMMENT = 0x10; // a zero-terminated comment follows
  71.     private static final int FRESERVED = 0xE0; // reserved bits; init() rejects a header that sets any of them

  72.     /**
  73.      * Checks if the signature matches what is expected for a .gz file.
  74.      *
  75.      * @param signature the bytes to check
  76.      * @param length    the number of bytes to check
  77.      * @return true if this is a .gz stream, false otherwise
  78.      *
  79.      * @since 1.1
  80.      */
  81.     public static boolean matches(final byte[] signature, final int length) {
  82.         return length >= 2 && signature[0] == 31 && signature[1] == -117;
  83.     }

  84.     private static byte[] readToNull(final DataInput inData) throws IOException {
  85.         try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
  86.             int b;
  87.             while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
  88.                 bos.write(b);
  89.             }
  90.             return bos.toByteArray();
  91.         }
  92.     }

  93.     private final BoundedInputStream countingStream; // wraps the caller's stream; getCompressedCount() reports its count

  94.     // Stream the decoder actually reads from: countingStream itself when it supports
  95.     // mark/reset, otherwise a BufferedInputStream wrapped around countingStream.
  96.     private final InputStream in;

  97.     // True if read() should continue with the next .gz member instead of stopping after the first one.
  98.     private final boolean decompressConcatenated;

  99.     // Buffer of compressed bytes read from "in" and handed to the Inflater.
  100.     private final byte[] buf = new byte[8192];

  101.     // Number of bytes placed in buf by the most recent fill; set back to 0 at the end of each member.
  102.     private int bufUsed;

  103.     // Decompressor, created in "nowrap" mode; set to null once it has been ended (close() or end of data).
  104.     private Inflater inf = new Inflater(true);

  105.     // Running CRC32 of the uncompressed data of the current member; compared with the value stored after it.
  106.     private final CRC32 crc = new CRC32();

  107.     // True once everything has been decompressed; read then returns -1.
  108.     private boolean endReached;

  109.     // Scratch buffer used by the no-arg read() method.
  110.     private final byte[] oneByte = new byte[1];

  111.     private final GzipParameters parameters = new GzipParameters(); // header metadata filled in by init(); returned by getMetaData()

  112.     /**
  113.      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
  114.      * <p>
  115.      * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
  116.      *
  117.      * @param inputStream the InputStream from which this object should be created of
  118.      *
  119.      * @throws IOException if the stream could not be created
  120.      */
  121.     public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
  122.         this(inputStream, false);
  123.     }

  124.     /**
  125.      * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
  126.      * <p>
  127.      * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
  128.      * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
  129.      * {@code mark} isn't supported, the input position will be undefined.
  130.      *
  131.      * @param inputStream            the InputStream from which this object should be created of
  132.      * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
  133.      *
  134.      * @throws IOException if the stream could not be created
  135.      */
  136.     public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
  137.         countingStream = BoundedInputStream.builder().setInputStream(inputStream).get();
  138.         // Mark support is strictly needed for concatenated files only,
  139.         // but it's simpler if it is always available.
  140.         if (countingStream.markSupported()) {
  141.             in = countingStream;
  142.         } else {
  143.             in = new BufferedInputStream(countingStream);
  144.         }

  145.         this.decompressConcatenated = decompressConcatenated;
  146.         init(true);
  147.     }

  148.     /**
  149.      * Closes the input stream (unless it is System.in).
  150.      *
  151.      * @since 1.2
  152.      */
  153.     @Override
  154.     public void close() throws IOException {
  155.         if (inf != null) {
  156.             inf.end();
  157.             inf = null;
  158.         }

  159.         if (this.in != System.in) {
  160.             this.in.close();
  161.         }
  162.     }

  163.     /**
  164.      * @since 1.17
  165.      */
  166.     @Override
  167.     public long getCompressedCount() {
  168.         return countingStream.getCount();
  169.     }

  170.     /**
  171.      * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
  172.      *
  173.      * @return the stream's meta data
  174.      * @since 1.8
  175.      */
  176.     public GzipParameters getMetaData() {
  177.         return parameters;
  178.     }

  179.     private boolean init(final boolean isFirstMember) throws IOException {
  180.         if (!isFirstMember && !decompressConcatenated) { // at least one must be true
  181.             throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false!");
  182.         }

  183.         // Check the magic bytes without a possibility of EOFException.
  184.         final int magic0 = in.read();

  185.         // If end of input was reached after decompressing at least
  186.         // one .gz member, we have reached the end of the file successfully.
  187.         if (magic0 == -1 && !isFirstMember) {
  188.             return false;
  189.         }

  190.         if (magic0 != 31 || in.read() != 139) {
  191.             throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
  192.         }

  193.         // Parsing the rest of the header may throw EOFException.
  194.         final DataInput inData = new DataInputStream(in);
  195.         final int method = inData.readUnsignedByte();
  196.         if (method != Deflater.DEFLATED) {
  197.             throw new IOException("Unsupported compression method " + method + " in the .gz header");
  198.         }

  199.         final int flg = inData.readUnsignedByte();
  200.         if ((flg & FRESERVED) != 0) {
  201.             throw new IOException("Reserved flags are set in the .gz header");
  202.         }

  203.         parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
  204.         switch (inData.readUnsignedByte()) { // extra flags
  205.         case 2:
  206.             parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
  207.             break;
  208.         case 4:
  209.             parameters.setCompressionLevel(Deflater.BEST_SPEED);
  210.             break;
  211.         default:
  212.             // ignored for now
  213.             break;
  214.         }
  215.         parameters.setOperatingSystem(inData.readUnsignedByte());

  216.         // Extra field, ignored
  217.         if ((flg & FEXTRA) != 0) {
  218.             int xlen = inData.readUnsignedByte();
  219.             xlen |= inData.readUnsignedByte() << 8;

  220.             // This isn't as efficient as calling in.skip would be,
  221.             // but it's lazier to handle unexpected end of input this way.
  222.             // Most files don't have an extra field anyway.
  223.             while (xlen-- > 0) {
  224.                 inData.readUnsignedByte();
  225.             }
  226.         }

  227.         // Original file name
  228.         if ((flg & FNAME) != 0) {
  229.             parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
  230.         }

  231.         // Comment
  232.         if ((flg & FCOMMENT) != 0) {
  233.             parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
  234.         }

  235.         // Header "CRC16" which is actually a truncated CRC32 (which isn't
  236.         // as good as real CRC16). I don't know if any encoder implementation
  237.         // sets this, so it's not worth trying to verify it. GNU gzip 1.4
  238.         // doesn't support this field, but zlib seems to be able to at least
  239.         // skip over it.
  240.         if ((flg & FHCRC) != 0) {
  241.             inData.readShort();
  242.         }

  243.         // Reset
  244.         inf.reset();
  245.         crc.reset();

  246.         return true;
  247.     }

  248.     @Override
  249.     public int read() throws IOException {
  250.         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
  251.     }

  252.     /**
  253.      * {@inheritDoc}
  254.      *
  255.      * @since 1.1
  256.      */
  257.     @Override
  258.     public int read(final byte[] b, int off, int len) throws IOException {
  259.         if (len == 0) {
  260.             return 0;
  261.         }
  262.         if (endReached) {
  263.             return -1;
  264.         }

  265.         int size = 0;

  266.         while (len > 0) {
  267.             if (inf.needsInput()) {
  268.                 // Remember the current position because we may need to
  269.                 // rewind after reading too much input.
  270.                 in.mark(buf.length);

  271.                 bufUsed = in.read(buf);
  272.                 if (bufUsed == -1) {
  273.                     throw new EOFException();
  274.                 }

  275.                 inf.setInput(buf, 0, bufUsed);
  276.             }

  277.             final int ret;
  278.             try {
  279.                 ret = inf.inflate(b, off, len);
  280.             } catch (final DataFormatException e) { // NOSONAR
  281.                 throw new IOException("Gzip-compressed data is corrupt");
  282.             }

  283.             crc.update(b, off, ret);
  284.             off += ret;
  285.             len -= ret;
  286.             size += ret;
  287.             count(ret);

  288.             if (inf.finished()) {
  289.                 // We may have read too many bytes. Rewind the read
  290.                 // position to match the actual amount used.
  291.                 in.reset();

  292.                 final int skipAmount = bufUsed - inf.getRemaining();
  293.                 if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
  294.                     throw new IOException();
  295.                 }

  296.                 bufUsed = 0;

  297.                 final DataInput inData = new DataInputStream(in);

  298.                 // CRC32
  299.                 final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

  300.                 if (crcStored != crc.getValue()) {
  301.                     throw new IOException("Gzip-compressed data is corrupt " + "(CRC32 error)");
  302.                 }

  303.                 // Uncompressed size modulo 2^32 (ISIZE in the spec)
  304.                 final long isize = ByteUtils.fromLittleEndian(inData, 4);

  305.                 if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
  306.                     throw new IOException("Gzip-compressed data is corrupt" + "(uncompressed size mismatch)");
  307.                 }

  308.                 // See if this is the end of the file.
  309.                 if (!decompressConcatenated || !init(false)) {
  310.                     inf.end();
  311.                     inf = null;
  312.                     endReached = true;
  313.                     return size == 0 ? -1 : size;
  314.                 }
  315.             }
  316.         }

  317.         return size;
  318.     }
  319. }