/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.CRC32;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CharsetNames;

/**
 * Input stream that decompresses .gz files.
 * This supports decompressing concatenated .gz files which is important
 * when decompressing standalone .gz files.
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
 * files: it stops after the first member and silently ignores the rest.
 * It doesn't leave the read position to point to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
 * support.
 * <p>
 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
 * container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 */
public class GzipCompressorInputStream extends CompressorInputStream {
    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    // The two magic bytes that open every .gz member
    private static final int MAGIC_0 = 31;
    private static final int MAGIC_1 = 139;

    // Compressed input stream, possibly wrapped in a BufferedInputStream
    private final InputStream in;

    // True if decompressing multi member streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed;

    // Decompressor; set to null once it has been released
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // True once everything has been decompressed
    private boolean endReached = false;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * This is equivalent to
     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
     * will not decompress concatenated .gz files.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream)
            throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * If <code>decompressConcatenated</code> is {@code false}:
     * This decompressor might read more input than it will actually use.
     * If <code>inputStream</code> supports <code>mark</code> and
     * <code>reset</code>, then the input position will be adjusted
     * so that it is right after the last byte of the compressed stream.
     * If <code>mark</code> isn't supported, the input position will be
     * undefined.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created
     * @param decompressConcatenated
     *                     if true, decompress until the end of the input;
     *                     if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(final InputStream inputStream,
                                     final boolean decompressConcatenated)
            throws IOException {
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (inputStream.markSupported()) {
            in = inputStream;
        } else {
            in = new BufferedInputStream(inputStream);
        }

        this.decompressConcatenated = decompressConcatenated;
        init(true);
    }

    /**
     * Provides the stream's meta data - may change with each stream
     * when decompressing concatenated streams.
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Parses the header of the next .gz member and resets the inflater and
     * the running CRC so that the member's data can be decompressed.
     *
     * @param isFirstMember true when parsing the very first member; a
     *                      missing header is then an error rather than a
     *                      regular end of input
     * @return true if a member header was parsed, false if the end of the
     *         input was reached cleanly after a previous member
     * @throws IOException if the header is malformed or truncated
     */
    private boolean init(final boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        final int magic0 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        // The second byte is only read when the first one matched, so
        // the stream is never read again after it has signalled EOF.
        if (magic0 != MAGIC_0 || in.read() != MAGIC_1) {
            throw new IOException(isFirstMember
                                  ? "Input is not in the .gz format"
                                  : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        final DataInput inData = new DataInputStream(in);
        final int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method "
                                  + method + " in the .gz header");
        }

        final int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException(
                    "Reserved flags are set in the .gz header");
        }

        // The header stores seconds; the parameters are set in milliseconds.
        parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4) * 1000);
        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFilename(new String(readToNull(inData),
                                              CharsetNames.ISO_8859_1));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData),
                                             CharsetNames.ISO_8859_1));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();

        return true;
    }

    /**
     * Reads bytes up to, but not including, the next zero byte. The
     * terminating zero byte is consumed.
     *
     * @param inData the input to read from
     * @return the bytes read before the terminator
     * @throws IOException if the input ends before a zero byte is found
     */
    private static byte[] readToNull(final DataInput inData) throws IOException {
        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        int b = 0;
        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
            bos.write(b);
        }
        return bos.toByteArray();
    }

    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IOException if the compressed data is corrupt or truncated,
     *                     or if the stream has been closed before all
     *                     data was read
     * @since 1.1
     */
    @Override
    public int read(final byte[] b, int off, int len) throws IOException {
        if (endReached) {
            return -1;
        }
        if (inf == null) {
            // close() released the inflater before the end of the data
            // was reached; fail cleanly instead of with a
            // NullPointerException.
            throw new IOException("Stream closed");
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (final DataFormatException e) {
                // Keep the inflater's diagnosis as the cause.
                throw new IOException("Gzip-compressed data is corrupt", e);
            }

            crc.update(b, off, ret);
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                rewindToEndOfCompressedData();
                verifyTrailer();

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }

    /**
     * Repositions the input right after the last byte the inflater
     * consumed. More bytes than needed may have been handed to the
     * inflater, so the input is reset to the last mark and only the bytes
     * actually used are skipped again.
     *
     * @throws IOException if the input cannot be reset or skipped
     */
    private void rewindToEndOfCompressedData() throws IOException {
        in.reset();

        // InputStream.skip may legally skip fewer bytes than requested,
        // so keep going until everything is skipped or no progress is made.
        long toSkip = bufUsed - inf.getRemaining();
        while (toSkip > 0) {
            final long skipped = in.skip(toSkip);
            if (skipped <= 0) {
                throw new IOException("Unable to reposition the input "
                                      + "after the compressed data");
            }
            toSkip -= skipped;
        }

        bufUsed = 0;
    }

    /**
     * Reads the eight byte member trailer and compares the stored CRC32
     * and uncompressed size with the values observed while decompressing.
     *
     * @throws IOException if the trailer is truncated or either value
     *                     doesn't match
     */
    private void verifyTrailer() throws IOException {
        final DataInput inData = new DataInputStream(in);

        // CRC32
        final long crcStored = ByteUtils.fromLittleEndian(inData, 4);

        if (crcStored != crc.getValue()) {
            throw new IOException("Gzip-compressed data is corrupt "
                                  + "(CRC32 error)");
        }

        // Uncompressed size modulo 2^32 (ISIZE in the spec)
        final long isize = ByteUtils.fromLittleEndian(inData, 4);

        if (isize != (inf.getBytesWritten() & 0xffffffffL)) {
            throw new IOException("Gzip-compressed data is corrupt "
                                  + "(uncompressed size mismatch)");
        }
    }

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return          true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(final byte[] signature, final int length) {
        return length >= 2
            && signature[0] == (byte) MAGIC_0
            && signature[1] == (byte) MAGIC_1;
    }

    /**
     * Closes the input stream (unless it is System.in).
     * <p>
     * Note that the check is made against the stream this class reads
     * from; if the constructor wrapped the supplied stream in a
     * {@link BufferedInputStream}, it is the wrapper that is compared
     * and closed.
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        if (this.in != System.in) {
            this.in.close();
        }
    }
}