001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.gzip; 020 021import java.io.BufferedInputStream; 022import java.io.ByteArrayOutputStream; 023import java.io.DataInput; 024import java.io.DataInputStream; 025import java.io.EOFException; 026import java.io.IOException; 027import java.io.InputStream; 028import java.nio.charset.Charset; 029import java.util.zip.CRC32; 030import java.util.zip.DataFormatException; 031import java.util.zip.Deflater; 032import java.util.zip.Inflater; 033 034import org.apache.commons.compress.compressors.CompressorInputStream; 035import org.apache.commons.compress.utils.ByteUtils; 036import org.apache.commons.compress.utils.InputStreamStatistics; 037import org.apache.commons.io.IOUtils; 038import org.apache.commons.io.build.AbstractOrigin; 039import org.apache.commons.io.build.AbstractStreamBuilder; 040import org.apache.commons.io.function.IOConsumer; 041import org.apache.commons.io.input.BoundedInputStream; 042 043/** 044 * Input stream that decompresses GZIP (.gz) files. 045 * 046 * <p> 047 * This supports decompressing concatenated GZIP files which is important when decompressing standalone GZIP files. 048 * </p> 049 * <p> 050 * Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder. Internally, decompression is done using 051 * {@link java.util.zip.Inflater}. 052 * </p> 053 * <p> 054 * If you use the constructor {@code GzipCompressorInputStream(in)}, {@code Builder.setDecompressConcatenated(false)}, or 055 * {@code GzipCompressorInputStream(in, false)}, then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this 056 * case, if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()}, then it will be left positioned just 057 * after the end of the encoded GZIP member; otherwise, some indeterminate number of extra bytes following the encoded GZIP member will have been consumed and 058 * discarded. 059 * </p> 060 * <p> 061 * If you use the {@code Builder.setDecompressConcatenated(true)} or {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after 062 * the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded GZIP member, otherwise an 063 * {@link IOException} is thrown. The data read from a stream constructed this way will consist of the concatenated data of all of the encoded GZIP members in 064 * order. 065 * </p> 066 * <p> 067 * To build an instance, use {@link Builder}. 068 * </p> 069 * 070 * @see Builder 071 * @see <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 1952 GZIP File Format Specification</a> 072 */ 073public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics { 074 075 // @formatter:off 076 /** 077 * Builds a new {@link GzipCompressorInputStream}. 078 * 079 * <p> 080 * For example: 081 * </p> 082 * <pre>{@code 083 * GzipCompressorInputStream s = GzipCompressorInputStream.builder() 084 * .setPath(path) 085 * .setFileNameCharset(StandardCharsets.ISO_8859_1) 086 * .get();} 087 * </pre> 088 * 089 * @see #get() 090 * @since 1.28.0 091 */ 092 // @formatter:on 093 public static class Builder extends AbstractStreamBuilder<GzipCompressorInputStream, Builder> { 094 095 /** True if decompressing multi-member streams. */ 096 private boolean decompressConcatenated; 097 098 private Charset fileNameCharset = GzipUtils.GZIP_ENCODING; 099 100 private IOConsumer<GzipCompressorInputStream> onMemberStart; 101 102 private IOConsumer<GzipCompressorInputStream> onMemberEnd; 103 104 /** 105 * Constructs a new builder of {@link GzipCompressorInputStream}. 106 */ 107 public Builder() { 108 // empty 109 } 110 111 /** 112 * Builds a new {@link GzipCompressorInputStream}. 113 * <p> 114 * You must set input that supports {@link InputStream}, otherwise, this method throws an exception. 115 * </p> 116 * 117 * @return a new instance. 118 * @throws IllegalStateException if the {@code origin} is {@code null}. 119 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}. 120 * @see AbstractOrigin#getInputStream(java.nio.file.OpenOption...) 121 */ 122 @Override 123 public GzipCompressorInputStream get() throws IOException { 124 return new GzipCompressorInputStream(this); 125 } 126 127 /** 128 * Sets whether we should allow decompressing multiple members. 129 * 130 * @param decompressConcatenated whether we should allow decompressing multiple members. 131 * @return this instance. 132 */ 133 public Builder setDecompressConcatenated(final boolean decompressConcatenated) { 134 this.decompressConcatenated = decompressConcatenated; 135 return this; 136 } 137 138 /** 139 * Sets the Charset to use for writing file names and comments, where null maps to {@link GzipUtils#GZIP_ENCODING}. 140 * <p> 141 * <em>Setting a value other than {@link GzipUtils#GZIP_ENCODING} is not compliant with the <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 142 * 1952 GZIP File Format Specification</a></em>. Use at your own risk of interoperability issues. 143 * </p> 144 * <p> 145 * The default value is {@link GzipUtils#GZIP_ENCODING}. 146 * </p> 147 * 148 * @param fileNameCharset the Charset to use for writing file names and comments, null maps to {@link GzipUtils#GZIP_ENCODING}. 149 * @return this instance. 150 */ 151 public Builder setFileNameCharset(final Charset fileNameCharset) { 152 this.fileNameCharset = fileNameCharset; 153 return this; 154 } 155 156 /** 157 * Sets the consumer called when a member <em>trailer</em> is parsed. 158 * <p> 159 * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}. 160 * </p> 161 * <p> 162 * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set. 163 * </p> 164 * 165 * @param onMemberEnd The consumer. 166 * @return this instance. 167 * @see GzipCompressorInputStream#getMetaData() 168 */ 169 public Builder setOnMemberEnd(final IOConsumer<GzipCompressorInputStream> onMemberEnd) { 170 this.onMemberEnd = onMemberEnd; 171 return this; 172 } 173 174 /** 175 * Sets the consumer called when a member <em>header</em> is parsed. 176 * <p> 177 * When a member <em>header</em> is parsed, all {@link GzipParameters} values are initialized except {@code trailerCrc} and {@code trailerISize}. 178 * </p> 179 * <p> 180 * When a member <em>trailer</em> is parsed, the {@link GzipParameters} values {@code trailerCrc} and {@code trailerISize} are set. 181 * </p> 182 * 183 * @param onMemberStart The consumer. 184 * @return this instance. 185 * @see GzipCompressorInputStream#getMetaData() 186 */ 187 public Builder setOnMemberStart(final IOConsumer<GzipCompressorInputStream> onMemberStart) { 188 this.onMemberStart = onMemberStart; 189 return this; 190 } 191 } 192 193 private static final IOConsumer<GzipCompressorInputStream> NOOP = IOConsumer.noop(); 194 195 /** 196 * Constructs a new builder of {@link GzipCompressorInputStream}. 197 * 198 * @return a new builder of {@link GzipCompressorInputStream}. 199 * @since 1.28.0 200 */ 201 public static Builder builder() { 202 return new Builder(); 203 } 204 205 /** 206 * Checks if the signature matches what is expected for a .gz file. 207 * 208 * @param signature the bytes to check 209 * @param length the number of bytes to check 210 * @return true if this is a .gz stream, false otherwise 211 * @since 1.1 212 */ 213 public static boolean matches(final byte[] signature, final int length) { 214 return length >= 2 && signature[0] == 31 && signature[1] == -117; 215 } 216 217 private static byte[] readToNull(final DataInput inData) throws IOException { 218 try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { 219 int b; 220 while ((b = inData.readUnsignedByte()) != 0) { // NOSONAR 221 bos.write(b); 222 } 223 return bos.toByteArray(); 224 } 225 } 226 227 /** Buffer to hold the input data. */ 228 private final byte[] buf = new byte[8192]; 229 230 /** Amount of data in buf. */ 231 private int bufUsed; 232 233 private final BoundedInputStream countingStream; 234 235 /** CRC32 from uncompressed data. */ 236 private final CRC32 crc = new CRC32(); 237 238 /** True if decompressing multi-member streams. */ 239 private final boolean decompressConcatenated; 240 241 /** True once everything has been decompressed. */ 242 private boolean endReached; 243 244 private final Charset fileNameCharset; 245 246 /** 247 * Compressed input stream, possibly wrapped in a BufferedInputStream, always wrapped in countingStream above 248 */ 249 private final InputStream in; 250 251 /** Decompressor. */ 252 private Inflater inflater = new Inflater(true); 253 254 /** Buffer for no-argument read method. */ 255 private final byte[] oneByte = new byte[1]; 256 257 private GzipParameters parameters; 258 259 private final IOConsumer<GzipCompressorInputStream> onMemberStart; 260 261 private final IOConsumer<GzipCompressorInputStream> onMemberEnd; 262 263 @SuppressWarnings("resource") // caller closes 264 private GzipCompressorInputStream(final Builder builder) throws IOException { 265 countingStream = BoundedInputStream.builder().setInputStream(builder.getInputStream()).get(); 266 // Mark support is strictly needed for concatenated files only, 267 // but it's simpler if it is always available. 268 in = countingStream.markSupported() ? countingStream : new BufferedInputStream(countingStream); 269 this.decompressConcatenated = builder.decompressConcatenated; 270 this.fileNameCharset = builder.fileNameCharset; 271 this.onMemberStart = builder.onMemberStart != null ? builder.onMemberStart : NOOP; 272 this.onMemberEnd = builder.onMemberEnd != null ? builder.onMemberEnd : NOOP; 273 init(true); 274 } 275 276 /** 277 * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream. 278 * <p> 279 * This is equivalent to {@code GzipCompressorInputStream(inputStream, false)} and thus will not decompress concatenated .gz files. 280 * </p> 281 * 282 * @param inputStream the InputStream from which this object should be created of 283 * @throws IOException if the stream could not be created 284 */ 285 public GzipCompressorInputStream(final InputStream inputStream) throws IOException { 286 this(builder().setInputStream(inputStream)); 287 } 288 289 /** 290 * Constructs a new input stream that decompresses gzip-compressed data from the specified input stream. 291 * <p> 292 * If {@code decompressConcatenated} is {@code false}: This decompressor might read more input than it will actually use. If {@code inputStream} supports 293 * {@code mark} and {@code reset}, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If 294 * {@code mark} isn't supported, the input position will be undefined. 295 * </p> 296 * 297 * @param inputStream the InputStream from which this object should be created of 298 * @param decompressConcatenated if true, decompress until the end of the input; if false, stop after the first .gz member 299 * @throws IOException if the stream could not be created 300 * @deprecated Use {@link Builder#get()}. 301 */ 302 @Deprecated 303 public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException { 304 this(builder().setInputStream(inputStream).setDecompressConcatenated(decompressConcatenated)); 305 } 306 307 /** 308 * Closes the input stream (unless it is System.in). 309 * 310 * @since 1.2 311 */ 312 @Override 313 public void close() throws IOException { 314 if (inflater != null) { 315 inflater.end(); 316 inflater = null; 317 } 318 if (this.in != System.in) { 319 this.in.close(); 320 } 321 } 322 323 /** 324 * {@inheritDoc}. 325 * 326 * @since 1.17 327 */ 328 @Override 329 public long getCompressedCount() { 330 return countingStream.getCount(); 331 } 332 333 /** 334 * Provides the stream's meta data - may change with each stream when decompressing concatenated streams. 335 * 336 * @return the stream's meta data 337 * @since 1.8 338 */ 339 public GzipParameters getMetaData() { 340 return parameters; 341 } 342 343 private boolean init(final boolean isFirstMember) throws IOException { 344 if (!isFirstMember && !decompressConcatenated) { // at least one must be true 345 throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false."); 346 } 347 // Check the magic bytes without a possibility of EOFException. 348 final int magic0 = in.read(); 349 // If end of input was reached after decompressing at least 350 // one .gz member, we have reached the end of the file successfully. 351 if (magic0 == -1 && !isFirstMember) { 352 return false; 353 } 354 if (magic0 != GzipUtils.ID1 || in.read() != GzipUtils.ID2) { 355 throw new IOException(isFirstMember ? "Input is not in the .gz format." : "Unexpected data after a valid .gz stream."); 356 } 357 parameters = new GzipParameters(); 358 parameters.setFileNameCharset(fileNameCharset); 359 // Parsing the rest of the header may throw EOFException. 360 final DataInput inData = new DataInputStream(in); 361 final int method = inData.readUnsignedByte(); 362 if (method != Deflater.DEFLATED) { 363 throw new IOException("Unsupported compression method " + method + " in the .gz header"); 364 } 365 final int flg = inData.readUnsignedByte(); 366 if ((flg & GzipUtils.FRESERVED) != 0) { 367 throw new IOException("Reserved flags are set in the .gz header."); 368 } 369 parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4)); 370 switch (inData.readUnsignedByte()) { // extra flags 371 case GzipUtils.XFL_MAX_COMPRESSION: 372 parameters.setCompressionLevel(Deflater.BEST_COMPRESSION); 373 break; 374 case GzipUtils.XFL_MAX_SPEED: 375 parameters.setCompressionLevel(Deflater.BEST_SPEED); 376 break; 377 default: 378 parameters.setCompressionLevel(Deflater.DEFAULT_COMPRESSION); 379 break; 380 } 381 parameters.setOperatingSystem(inData.readUnsignedByte()); 382 // Extra field 383 if ((flg & GzipUtils.FEXTRA) != 0) { 384 int xlen = inData.readUnsignedByte(); 385 xlen |= inData.readUnsignedByte() << 8; 386 final byte[] extra = new byte[xlen]; 387 inData.readFully(extra); 388 parameters.setExtraField(ExtraField.fromBytes(extra)); 389 } 390 // Original file name 391 if ((flg & GzipUtils.FNAME) != 0) { 392 parameters.setFileName(new String(readToNull(inData), parameters.getFileNameCharset())); 393 } 394 // Comment 395 if ((flg & GzipUtils.FCOMMENT) != 0) { 396 parameters.setComment(new String(readToNull(inData), parameters.getFileNameCharset())); 397 } 398 // Header "CRC16" which is actually a truncated CRC32 (which isn't 399 // as good as real CRC16). I don't know if any encoder implementation 400 // sets this, so it's not worth trying to verify it. GNU gzip 1.4 401 // doesn't support this field, but zlib seems to be able to at least 402 // skip over it. 403 if ((flg & GzipUtils.FHCRC) != 0) { 404 parameters.setHeaderCRC(true); 405 inData.readShort(); 406 } 407 // Reset 408 inflater.reset(); 409 crc.reset(); 410 onMemberStart.accept(this); 411 return true; 412 } 413 414 @Override 415 public int read() throws IOException { 416 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 417 } 418 419 /** 420 * {@inheritDoc} 421 * 422 * @since 1.1 423 */ 424 @Override 425 public int read(final byte[] b, int off, int len) throws IOException { 426 if (len == 0) { 427 return 0; 428 } 429 if (endReached) { 430 return -1; 431 } 432 433 int size = 0; 434 435 while (len > 0) { 436 if (inflater.needsInput()) { 437 // Remember the current position because we may need to 438 // rewind after reading too much input. 439 in.mark(buf.length); 440 441 bufUsed = in.read(buf); 442 if (bufUsed == -1) { 443 throw new EOFException(); 444 } 445 446 inflater.setInput(buf, 0, bufUsed); 447 } 448 449 final int ret; 450 try { 451 ret = inflater.inflate(b, off, len); 452 } catch (final DataFormatException e) { // NOSONAR 453 throw new IOException("Gzip-compressed data is corrupt.", e); 454 } 455 456 crc.update(b, off, ret); 457 off += ret; 458 len -= ret; 459 size += ret; 460 count(ret); 461 462 if (inflater.finished()) { 463 // We may have read too many bytes. Rewind the read 464 // position to match the actual amount used. 465 in.reset(); 466 final int skipAmount = bufUsed - inflater.getRemaining(); 467 if (IOUtils.skip(in, skipAmount) != skipAmount) { 468 throw new IOException(); 469 } 470 bufUsed = 0; 471 final DataInput inData = new DataInputStream(in); 472 // CRC32 473 final long trailerCrc = ByteUtils.fromLittleEndian(inData, 4); 474 if (trailerCrc != crc.getValue()) { 475 throw new IOException("Gzip-compressed data is corrupt (CRC32 error)."); 476 } 477 // Uncompressed size modulo 2^32, ISIZE in the RFC. 478 final long iSize = ByteUtils.fromLittleEndian(inData, 4); 479 if (iSize != (inflater.getBytesWritten() & 0xffffffffL)) { 480 throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch)."); 481 } 482 parameters.setTrailerCrc(trailerCrc); 483 parameters.setTrailerISize(iSize); 484 onMemberEnd.accept(this); 485 // See if this is the end of the file. 486 if (!decompressConcatenated || !init(false)) { 487 inflater.end(); 488 inflater = null; 489 endReached = true; 490 return size == 0 ? -1 : size; 491 } 492 } 493 } 494 495 return size; 496 } 497}