/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.CRC32;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;
import org.apache.commons.io.input.CountingInputStream;

/**
 * Input stream that decompresses .gz files.
 *
 * <p>
 * This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.
 * </p>
 *
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz files: it stops after the first member and silently ignores the rest. It doesn't
 * leave the read position to point to the beginning of the next member, which makes it difficult to work around the lack of concatenation support.
 * </p>
 *
 * <p>
 * Instead of using {@code GZIPInputStream}, this class has its own .gz container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 * </p>
 *
 * <p>
 * If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)} with some {@code InputStream} {@code in}
 * then {@link #read} will return -1 as soon as the first internal member has been read completely. The stream {@code in} will be positioned at the start of
 * the second gzip member if there is one.
 * </p>
 *
 * <p>
 * If you use the constructor {@code GzipCompressorInputStream(in, true)} with some {@code InputStream} {@code in} then {@link #read} will return -1 once the
 * stream {@code in} has been exhausted. The data read from a stream constructed this way will consist of the concatenated data of all gzip members contained
 * inside {@code in}.
 * </p>
 *
 * @see "https://tools.ietf.org/html/rfc1952"
 */
public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {

    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * <p>
     * The first two bytes are compared against the gzip magic number {@code 0x1f 0x8b} (31 and -117 as signed bytes).
     * </p>
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        return length >= 2 && signature[0] == 31 && signature[1] == -117;
    }

    /**
     * Reads bytes up to, but not including, the next zero byte. The terminating zero byte is consumed.
     *
     * @param inData the source to read from
     * @return the bytes read before the terminator, possibly empty
     * @throws IOException if reading fails; {@code readUnsignedByte} signals a premature end of input with an {@link EOFException}
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            int b;
            while ((b = inData.readUnsignedByte()) != 0) { // NOPMD NOSONAR
                bos.write(b);
            }
            return bos.toByteArray();
        }
    }

    // The stream handed to the constructor; only kept so that close()
    // can tell whether the caller passed System.in.
    private final InputStream originalIn;

    private final CountingInputStream countingStream;

    // Compressed input stream, possibly wrapped in a
    // BufferedInputStream, always wrapped in countingStream above
    private final InputStream in;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor; set to null once it has been released
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files.
     * </p>
     *
     * @param inputStream the InputStream from which this object should be created
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
     * <p>
     * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports
     * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If
     * {@code mark} isn't supported, the input position will be undefined.
     * </p>
     *
     * @param inputStream            the InputStream from which this object should be created
     * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
        originalIn = inputStream;
        countingStream = new CountingInputStream(inputStream);
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (countingStream.markSupported()) {
            in = countingStream;
        } else {
            in = new BufferedInputStream(countingStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Closes the input stream (unless it is System.in).
     *
     * <p>
     * The inflater is released in either case.
     * </p>
     *
     * @throws IOException if closing the wrapped stream fails
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        // "in" is always a wrapper (counting and possibly buffering), so it can
        // never be identical to System.in; the check has to look at the stream
        // the caller originally supplied.
        if (originalIn != System.in) {
            in.close();
        }
    }

    /**
     * Gets the number of bytes read from the wrapped input stream so far, as reported by the internal counting stream.
     *
     * @return the number of compressed bytes read
     * @since 1.17
     */
    @Override
    public long getCompressedCount() {
        return countingStream.getByteCount();
    }

    /**
     * Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
     *
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Reads and validates the header of the next .gz member and resets the inflater and CRC for it.
     *
     * @param isFirstMember true when reading the very first member of the stream
     * @return true if a member header was read, false if the input ended cleanly before a further member
     * @throws IOException if the header is missing, malformed, unsupported or truncated
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || in.read() != 139) {
            throw new IOException(isFirstMember ? "Input is not in the .gz format" : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method " + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException("Reserved flags are set in the .gz header");
        }

        // MTIME is stored in seconds; the parameters take milliseconds.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            // XLEN is a 16-bit little-endian length.
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFileName(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData), GzipUtils.GZIP_ENCODING));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads a single decompressed byte.
     *
     * @return the byte as a value in the range 0 to 255, or -1 if the end of the decompressed data has been reached
     * @throws IOException if reading or decompressing fails
     */
    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * When the end of a member is reached, its CRC32 and size trailer are verified before either stopping or continuing with the next member.
     * </p>
     *
     * @throws EOFException if the compressed input ends before the end of the deflate data
     * @throws IOException  if the data is corrupt, the trailer does not match, or the stream has been closed
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        if (endReached) {
            return -1;
        }
        if (inf == null) {
            // close() released the inflater before the end of the data was reached.
            throw new IOException("Stream closed");
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            final int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) {
                // Keep the cause so the details of the corruption are not lost.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                in.reset();

                final int skipAmount = bufUsed - inf.getRemaining();
                if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
                    throw new IOException("Failed to reposition the input after the end of the compressed data");
                }

                bufUsed = 0;

                final DataInput inData = new DataInputStream(in);

                // CRC32
                final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt (CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                final long isize = ByteUtils.fromLittleEndian(inData, 4);

                if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
                    throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }
}