/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.CRC32;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;
import org.apache.commons.io.input.BoundedInputStream;

/**
 * Input stream that decompresses .gz files.
 *
 * <p>
 * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
 * </p>
 *
 * <p>
 * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder.
 * The actual decompression is done with {@link java.util.zip.Inflater}.
 * </p>
 *
 * <p>
 * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)},
 * then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this case,
 * if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()},
 * then it will be left positioned just after the end of the encoded GZIP member; otherwise, some indeterminate number
 * of extra bytes following the encoded GZIP member will have been consumed and discarded.
 * </p>
 *
 * <p>
 * If you use the constructor {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
 * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded
 * GZIP member, otherwise an {@link IOException} is thrown. The data read from a stream constructed this way will consist
 * of the concatenated data of all of the encoded GZIP members in order.
 * </p>
 *
 * @see "https://tools.ietf.org/html/rfc1952"
 */
public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {

    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        // 31 and -117 are the two magic bytes 0x1f 0x8b as signed Java bytes.
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    /**
     * Reads bytes up to, but not including, the next zero byte. The terminating zero byte is consumed.
     *
     * @param inData the input to read from
     * @return the bytes that preceded the terminating zero byte
     * @throws IOException if reading fails, including an {@link EOFException} when the input ends before a zero byte is seen
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            int b;
            while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
                bos.write(b);
            }
            return bos.toByteArray();
        }
    }

    private final BoundedInputStream countingStream;

    // Compressed input stream, possibly wrapped in a
    // BufferedInputStream, always wrapped in countingStream above
    private final InputStream in;

    // True if the stream handed to the constructor was System.in. The field "in" is always a
    // wrapper created by this class, so it can never be compared against System.in directly.
    private final boolean wrapsSystemIn;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor; set to null once it has been released
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
     *
     * @param inputStream the InputStream to read the compressed data from
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
     * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
     * {@code mark} isn't supported, the input position will be undefined.
     *
     * @param inputStream            the InputStream to read the compressed data from
     * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
        wrapsSystemIn = inputStream == System.in;
        countingStream = BoundedInputStream.builder().setInputStream(inputStream).get();
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (countingStream.markSupported()) {
            in = countingStream;
        } else {
            in = new BufferedInputStream(countingStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Releases the inflater and closes the input stream, unless the stream passed to the constructor was {@code System.in}.
     *
     * @throws IOException if closing the input stream fails
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        // Comparing "in" with System.in would always differ because "in" is a wrapper,
        // so the decision is based on what the caller originally supplied.
        if (!wrapsSystemIn) {
            this.in.close();
        }
    }

    /**
     * Gets the number of bytes reported by the counting wrapper around the compressed input.
     *
     * @return the count reported by the counting stream
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getCount();
    }

    /**
     * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
     *
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Reads and validates the header of the next .gz member, stores its meta data and resets the inflater and CRC.
     *
     * @param isFirstMember true when reading the very first member of the stream
     * @return true if a member header was read; false if the end of the input was reached cleanly after a previous member
     * @throws IOException if the header is malformed or unsupported, or the input ends in the middle of the header
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        if (!isFirstMember && !decompressConcatenated) { // at least one must be true
            throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false!");
        }

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || in.read() != 139) {
            throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method " + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException("Reserved flags are set in the .gz header");
        }

        // The header stores the modification time in seconds; it is scaled by 1000 before being stored.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            // Two-byte little-endian length of the extra field.
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads a single decompressed byte.
     *
     * @return the byte as a value in the range 0-255, or -1 if the end of the decompressed data has been reached
     * @throws IOException if reading or decompressing fails
     */
    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IOException if the stream has been closed, the compressed data is truncated or corrupt, or the
     *                     stored CRC32 or size does not match the decompressed data
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }
        if (inf == null) {
            // close() released the inflater before the end of the data was reached.
            throw new IOException("Stream closed");
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            final int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) {
                // Keep the inflater's diagnosis available to callers as the cause.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                in.reset();

                final int skipAmount = bufUsed - inf.getRemaining();
                final long skipped = org.apache.commons.io.IOUtils.skip(in, skipAmount);
                if (skipped != skipAmount) {
                    throw new IOException("Could not skip " + skipAmount + " bytes of consumed gzip input, skipped " + skipped);
                }

                bufUsed = 0;

                final DataInput inData = new DataInputStream(in);

                // CRC32
                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt (CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                final long isize = ByteUtils.fromLittleEndian(inData, 4);

                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }
}